def train(data_loader, epochs=3):
    """
    Fine-tunes XLNet on the task-specific data provided by data_loader.
    The BERT authors recommend between 2 and 4 training epochs.
    Returns the fine-tuned model.
    """
    model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

    # Group parameters so that weight decay is not applied to biases or LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    # This optimizer carries all of the hyperparameter information our training loop needs
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    train_loss_set = []
    # trange is a tqdm wrapper around the normal Python range
    for _ in trange(epochs, desc="Epoch"):
        model.train()
        # Tracking variables
        tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0
        for batch in data_loader:
            batch = tuple(t.to(device) for t in batch)
            # Clear any previously calculated gradients before performing a backward pass
            optimizer.zero_grad()
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            train_loss_set.append(loss.item())
            loss.backward()
            optimizer.step()
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
    return model
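train() expects data_loader batches that unpack into (input_ids, attention_mask, labels) tensors and relies on a global device. A minimal sketch of building such a loader follows; the texts, labels, max length, and batch size are placeholder assumptions, and a recent transformers tokenizer API is assumed.

# Minimal sketch (assumed, not from the original snippet): building a DataLoader whose
# batches unpack as (input_ids, attention_mask, labels), as train() above expects.
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import XLNetTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

texts = ["an example sentence", "another example"]   # placeholder data
labels = [0, 1]                                       # placeholder labels

enc = tokenizer(texts, padding="max_length", truncation=True,
                max_length=128, return_tensors="pt")
dataset = TensorDataset(enc["input_ids"], enc["attention_mask"], torch.tensor(labels))
data_loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=16)

# model = train(data_loader, epochs=3)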
def __init__(self):
    super(XlnetModel, self).__init__()
    self.xlnet = XLNetForSequenceClassification.from_pretrained(
        "hfl/chinese-xlnet-base", num_labels=2)  # /bert_pretrain/
    self.device = torch.device("cuda")
    for param in self.xlnet.parameters():
        param.requires_grad = True  # every parameter requires a gradient
def Get_Model(modelName):
    model = None
    if modelName == 'XLNet':
        model = XLNetForSequenceClassification.from_pretrained(
            pretrained_model_path,
            num_labels=2)  # The number of output labels -- 2 for binary classification.
    elif modelName == 'BERT':
        model = BertForSequenceClassification.from_pretrained(
            pretrained_model_path,
            num_labels=2)
    elif modelName == 'RoBerta':
        model = RobertaForSequenceClassification.from_pretrained(
            pretrained_model_path,
            num_labels=2)
    elif modelName == 'Albert':
        model = AlbertForSequenceClassification.from_pretrained(
            pretrained_model_path,
            num_labels=2)
    return model
def __init__(self, xlnet_pretrained_model="xlnet-base-cased", xlnet_pretrained_tokenizer=None, train_batch_size=8, eval_batch_size=8, num_labels=2, learning_rate=3e-5, train_dset=None, eval_dset=None): # define hyperparameters self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_labels = num_labels # loading pre-trained models self.model = XLNetForSequenceClassification.from_pretrained( xlnet_pretrained_model, num_labels=num_labels).to(self.DEVICE) self.tokenizer = XLNetTokenizer.from_pretrained(xlnet_pretrained_model) # creating / loading datasets self.train_dset = train_dset self.eval_dset = eval_dset self.train_loader = DataLoader(self.train_dset, batch_size=self.train_batch_size, shuffle=True) self.eval_loader = DataLoader(self.eval_dset, batch_size=self.eval_batch_size)
def __init__(self, batchsize=16, max_len=64): RANDOM_SEED = 42 np.random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print('device {}'.format(device)) model = XLNetForSequenceClassification.from_pretrained( 'xlnet-base-cased', num_labels=3) model = model.to(device) self.device = device self.model = model PRE_TRAINED_MODEL_NAME = 'xlnet-base-cased' self.tokenizer = XLNetTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME) self.test_size = 0.5 self.random_state = 101 self.MAX_LEN = max_len self.BATCH_SIZE = batchsize self.EPOCHS = 10 self.num_data_workers = 4 self.model_file = './models/xlnet_model_batch{}.bin'.format(batchsize) self.class_names = ['positive', 'negative', 'neutral'] #self.class_names = ['positive', 'negative'] self.columns = None
def __init__(self, config):
    super(Model, self).__init__()
    model_config = XLNetConfig.from_pretrained(config.bert_path,
                                               num_labels=config.num_classes)
    self.xlnet = XLNetForSequenceClassification.from_pretrained(config.bert_path,
                                                                config=model_config)
    # Fine-tune all parameters (the attribute is self.xlnet, not self.bert)
    for param in self.xlnet.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)
def get_predictions(self, sentences):
    """
    Make predictions on sentences.
    :param sentences: the sentences
    :return: a dataframe containing the sentences and the predictions
    """
    self.tag2idx = get_existing_tag2idx(self.model_folder)
    tag2name = {self.tag2idx[key]: key for key in self.tag2idx.keys()}

    model = XLNetForSequenceClassification.from_pretrained(
        self.model_folder, num_labels=len(tag2name))
    model.to(self.device)
    model.eval()

    logger.info("Setting input embedding")
    input, masks, segs = generate_dataloader_input(sentences)
    dataloader = get_dataloader(input, masks, segs, BATCH_NUM)

    nb_eval_steps, nb_eval_examples = 0, 0
    y_predict = []
    logger.info("Running evaluation...")
    for step, batch in enumerate(dataloader):
        if nb_eval_steps % 100 == 0:
            logger.info(f"Step {nb_eval_steps}")
        batch = tuple(t.to(self.device) for t in batch)
        b_input_ids, b_input_mask, b_segs = batch
        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                token_type_ids=b_segs,
                input_mask=b_input_mask,
            )
            logits = outputs[0]
        # Get the text classification prediction for each example
        logits = logits.detach().cpu().numpy()
        for predict in np.argmax(logits, axis=1):
            y_predict.append(predict)
        nb_eval_steps += 1

    final_df = pd.DataFrame({
        "sentences": sentences,
        "label": [tag2name[pred] for pred in y_predict],
        "y_pred": y_predict
    })
    return final_df
def __init__(self, model_name, model_type):
    """
    Hyper-parameters found with the validation set:
        xlnet-large-cased  : epochs = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2  : epochs = 3, learning_rate = 5e-5, batch_size = 8,  epsilon = 1e-6
    To be improved...
    """
    self.model_name = model_name
    self.model_type = model_type

    # Per the transformers library, a batch size of 16 or 32 is advised for training.
    # For memory reasons we take 16 (8 for ALBERT). Gradient accumulation has not led
    # to great improvement and therefore won't be used here.
    if model_type == 'albert':
        self.batch_size = 8
    else:
        self.batch_size = 16

    available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
    available_model_type = ["bert", "xlnet", "albert"]
    if self.model_name not in available_model_name:
        raise Exception("Error: model_name should be in", available_model_name)
    if self.model_type not in available_model_type:
        raise Exception("Error: model_type should be in", available_model_type)

    # Load the pretrained model with a single linear regression layer on top of the pooled output.
    # A fine-tuned model can be reloaded later, e.g. BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
    if self.model_type == 'bert':
        self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for a regression task
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'xlnet':
        self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'albert':
        self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    self.model.cuda()

    if self.model_name == 'xlnet-large-cased':
        self.epochs = 4
        self.lr = 1e-5
        self.eps = 1e-6
    elif self.model_name == 'bert-large-uncased':
        self.epochs = 4
        self.lr = 3e-5
        self.eps = 1e-8
    elif self.model_name == 'albert-xxlarge-v2':
        self.epochs = 3
        self.lr = 5e-5
        self.eps = 1e-6

    # Gradient threshold: gradients whose norms exceed this are scaled down to match it.
    self.max_grad_norm = 1.0
    self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
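The constructor above only sets up the AdamW optimizer and max_grad_norm; the per-batch step that would use them is not shown. Below is a minimal sketch of such a step, assuming batches of (input_ids, attention_mask, labels); the function name and variables are illustrative, not part of the original class.

# Illustrative training step (assumed, not from the original class): applies the
# configured AdamW optimizer and clips gradient norms at max_grad_norm.
import torch

def training_step(model, optimizer, max_grad_norm, b_input_ids, b_input_mask, b_labels):
    model.train()
    optimizer.zero_grad()
    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]  # with num_labels=1 this is a regression (MSE) loss
    loss.backward()
    # Scale down gradients whose norm exceeds max_grad_norm (1.0 above)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    return loss.item()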
def load_model():
    checkpoints = list(
        os.path.dirname(c) for c in sorted(
            glob.glob(checkpoint_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
    model = XLNetForSequenceClassification.from_pretrained(checkpoints[0])
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    model.to(device)
    model.eval()
    return (model, tokenizer)
def create_from_pretrained(task_type, xlnet_model_name, cache_dir, num_labels):
    # Only classification is supported; the regression branch was removed because
    # this sentiment-analysis task has no regression variant.
    if task_type == TaskType.CLASSIFICATION:
        model = XLNetForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=xlnet_model_name,
            cache_dir=cache_dir,
            num_labels=num_labels)
    else:
        raise KeyError(task_type)
    return model
def load_model(self, model_path):
    model = XLNetForSequenceClassification.from_pretrained(
        model_path,
        num_labels=self.args['num_classes'],  # The number of output labels -- 2 for binary classification
        output_attentions=False,    # Whether the model returns attention weights.
        output_hidden_states=False  # Whether the model returns all hidden states.
    )
    if torch.cuda.is_available():
        model.cuda(self.device)
    return model
def __init__(self, requires_grad=True):
    super(XlnetModel, self).__init__()
    self.xlnet = XLNetForSequenceClassification.from_pretrained(
        'xlnet-large-cased', num_labels=2)
    self.tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased', do_lower_case=True)
    self.requires_grad = requires_grad
    self.device = torch.device("cuda")
    for param in self.xlnet.parameters():
        param.requires_grad = requires_grad  # each parameter requires a gradient
def main():
    # Get command-line arguments (auto-generated by docopt from the docstring at the top of this file)
    args = docopt(__doc__)
    pprint(args)

    # Read parameters
    lr = float(args['--lr'])
    seq_len = int(args['--seq_len'])
    max_epoch = int(args['--max_epoch'])
    batch_size = int(args['--batch_size'])
    num_train = int(args['--num_train'])
    num_valid = int(args['--num_valid'])

    # Select the model
    pretrained_weights = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(pretrained_weights)
    config = XLNetConfig.from_pretrained(pretrained_weights, num_labels=4)
    # Pass the config so the 4-label classification head is actually used
    model = XLNetForSequenceClassification.from_pretrained(pretrained_weights, config=config)
    print(model.config.num_labels)

    # Pick the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load the data and build the datasets
    encoder = TwinPhraseEncoder(tokenizer, seq_len)
    train_dataset = WordnetDataset(mode='train', num_data=num_train, transform=encoder)
    valid_dataset = WordnetDataset(mode='valid', num_data=num_valid, transform=encoder)
    train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=True)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(1, max_epoch + 1):
        print('=' * 27 + f' Epoch {epoch:0>2} ' + '=' * 27)
        # Training
        loss, accu = train_model(model, optimizer, train_loader, device)
        print(f'| Training   | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')
        # Validation
        loss, accu = valid_model(model, optimizer, valid_loader, device)
        print(f'| Validation | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')

    # Save the model
    torch.save(model.state_dict(), f'../result/{pretrained_weights}.pkl')
def __init__(self, pretrained_model_path, num_classes, device, d_model=1024, n_layer=24,
             n_head=16, d_inner=4096, ff_activation='gelu', untie_r=True, attn_type='bi',
             initializer_range=0.02, layer_norm_eps=1e-12, dropout=0.1):
    super(XLNetForTextClassification, self).__init__()
    print('Reloading pretrained models...')
    self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_path)
    self.tokenizer.model_max_length = 512
    self.model = XLNetForSequenceClassification.from_pretrained(
        pretrained_model_path, num_labels=num_classes).to(device)
    self.softmax = torch.nn.Softmax(dim=1)
    self.device = device
def __init__(self, requires_grad=True, num_labels=2):
    super(XlnetModel, self).__init__()
    self.num_labels = num_labels
    self.xlnet = XLNetForSequenceClassification.from_pretrained(
        'hfl/chinese-xlnet-base', num_labels=self.num_labels)
    self.tokenizer = XLNetTokenizer.from_pretrained(
        'hfl/chinese-xlnet-base', do_lower_case=True)
    # self.xlnet = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=self.num_labels)
    # self.tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased', do_lower_case=True)
    self.requires_grad = requires_grad
    self.device = torch.device("cuda")
    for param in self.xlnet.parameters():
        param.requires_grad = requires_grad  # every parameter requires a gradient
def make_model(args, device): if args.model == "roberta": config = RobertaConfig.from_pretrained("roberta-base") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = RobertaForSequenceClassification.from_pretrained( "roberta-base", config=config) return scl_model_Roberta(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary) if args.model == "bert": config = BertConfig.from_pretrained("bert-base-uncased") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", config=config) return scl_model_Bert(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary) if args.model == "xlnet": config = XLNetConfig.from_pretrained("xlnet-base-cased") config.num_labels = 5 if args.dataset == "imdb": config.num_labels = 2 if args.dataset == "ag_news": config.num_labels = 4 if args.dataset == "yahoo": config.num_labels = 10 pretrained_model = XLNetForSequenceClassification.from_pretrained( "xlnet-base-cased", config=config) return scl_model_Xlnet(config, device, pretrained_model, with_semi=args.with_mix, with_sum=args.with_summary)
def pick_model(model_name, num_labels): """ Return specified model: Available model names: ['albert-base-v2'\ , 'bert-base-uncased', 'bert-large-uncased'\ , 'roberta-base', 'xlnet-base-cased', ] """ if model_name == 'albert-base-v2': model = AlbertForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) if model_name in ('bert-base-uncased', 'bert-large-uncased'): model = BertForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) if model_name in ('roberta-base', "roberta-large", "roberta-large-mnli"): model = RobertaForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) if model_name == 'xlnet-base-cased': model = XLNetForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. ) print(f'Loaded {model_name} model.') if torch.cuda.is_available(): model.cuda() return model
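A quick usage sketch for pick_model; the model name, label count, and example sentence below are illustrative assumptions, and pick_model is the function defined above.

# Illustrative usage (assumed): load a 2-label XLNet classifier via pick_model and run one prediction.
import torch
from transformers import XLNetTokenizer

model = pick_model('xlnet-base-cased', num_labels=2)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

model.eval()
enc = tokenizer("an example sentence", return_tensors="pt")
if torch.cuda.is_available():
    enc = {k: v.cuda() for k, v in enc.items()}
with torch.no_grad():
    logits = model(**enc)[0]
pred = logits.argmax(dim=-1).item()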
def finetune(self, input_text: List[str], output: List[str], max_input_length=128,
             validation_split: float = 0.15, epochs: int = 20, batch_size: int = None,
             early_stopping: bool = True, trainer: pl.Trainer = None):
    """
    Finetune XLNet for text classification.

    input_text and output must be ordered 1:1.
    Unique data classes are determined automatically from the output data.

    Args:
        input_text: List of strings to classify (must match output ordering)
        output: List of input classifications (must match input ordering)
        max_input_length: Maximum number of tokens to be supported as input. Caps at 512.
        validation_split: Float between 0 and 1 that determines what percentage of the data to use for validation
        epochs: Integer that specifies how many iterations of training to do
        batch_size: Leave as None to determine the batch size automatically
        early_stopping: Boolean that determines whether to automatically stop when validation loss stops improving
        trainer: Your custom pytorch_lightning trainer
    """
    assert len(input_text) == len(output)
    OPTIMAL_BATCH_SIZE = 128

    labels = set(output)
    self.labels = {k: v for k, v in enumerate(labels)}
    class_to_idx = {v: k for k, v in enumerate(labels)}

    self.model = XLNetForSequenceClassification.from_pretrained(
        self.model_path, num_labels=len(labels))

    print("Processing data...")
    dataset = zip(input_text, output)
    dataset = [self.encode(r[0], class_to_idx[r[1]], max_input_length) for r in dataset]

    Finetunable.finetune(self, dataset, validation_split=validation_split,
                         epochs=epochs, optimal_batch_size=OPTIMAL_BATCH_SIZE,
                         early_stopping=early_stopping, trainer=trainer)
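A sketch of how the finetune entry point above might be called; "clf" is an assumed placeholder for whatever object exposes this method, and the toy data is illustrative only.

# Illustrative call (assumed, not from the original library).
input_text = ["great film, loved it", "utterly boring", "fantastic acting", "would not recommend"]
output = ["positive", "negative", "positive", "negative"]

clf.finetune(input_text,
             output,
             max_input_length=128,
             validation_split=0.25,
             epochs=5,
             early_stopping=True)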
def __init__(self, bert_config, device, n_class):
    """
    :param bert_config: str, BERT configuration description
    :param device: torch.device
    :param n_class: int
    """
    super(DefaultModel, self).__init__()

    self.n_class = n_class
    self.bert_config = bert_config
    self.bert = XLNetForSequenceClassification.from_pretrained(
        self.bert_config, num_labels=self.n_class, output_hidden_states=False)
    self.tokenizer = XLNetTokenizer.from_pretrained(self.bert_config)
    self.device = device
def demo5():
    from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
    import torch

    # Define the path and initialize the tokenizer
    XLN_PATH = r"D:\transformr_files\XLNetLMHeadModel"
    tokenizer = XLNetTokenizer.from_pretrained(XLN_PATH)

    # Load the config
    model_config = XLNetConfig.from_pretrained(XLN_PATH)
    # Set the number of classes to 3
    model_config.num_labels = 3

    # Build XLNetForSequenceClassification directly from the XLNet config
    # (equivalent to the approach in the previous section)
    cls_model = XLNetForSequenceClassification.from_pretrained(
        XLN_PATH, config=model_config)

    # Set evaluation mode (the original called model.eval(), but the variable is cls_model)
    cls_model.eval()
    token_codes = tokenizer.encode_plus("i like you, what about you")
def main(argv, arc):
    assert len(argv) == 4, 'input should be: test_data, model_path, output_path'
    test_path = argv[1]
    model_name = argv[2]
    output_path = argv[3]

    test_df = pd.read_csv(test_path, dtype={'A': 'str', 'B': 'str'})
    if 'Unnamed: 0' in test_df.columns:
        test_df = test_df.drop(['Unnamed: 0'], axis=1)
    print(len(test_df), end='\n')

    tokenizer = XLNetTokenizer.from_pretrained(pre_trained_model_name, do_lower_case=True)
    testset = DialogueDataset(test_df, 'test', tokenizer=tokenizer)

    # First way: load the whole pickled model
    # with open(f'./model/{model_name}', 'rb') as input_model:
    #     model = pickle.load(input_model)

    # Second way: rebuild the model and load the saved state dict
    NUM_LABELS = 2
    model = xlnet_model()
    model.model = XLNetForSequenceClassification.from_pretrained(
        pre_trained_model_name, num_labels=NUM_LABELS)
    # model.model = BertForNextSentencePrediction.from_pretrained(pre_trained_model_name)
    model.model.load_state_dict(
        torch.load(f'{model_name}', map_location=f'cuda:{device}'))
    print(model.val_accu_list)

    preds = model.predict(testset)
    test_df['prob'] = preds

    groups = test_df.groupby('question')
    ans = []
    for index, data in groups:
        if 'candidate_id' in test_df.columns:
            ans.append(data.loc[data['prob'].idxmax(), 'candidate_id'])
        else:
            ans.append(data.loc[data['prob'].idxmax(), 'B'])

    pred_df = pd.DataFrame()
    # pred_df['id'] = [f'{i}' for i in range(80000, 82000)]
    pred_df['id'] = [f'{80000 + i}' for i in range(0, len(ans))]
    # pred_df['id'] = [82000]
    pred_df['candidate-id'] = ans
    pred_df.to_csv(output_path, index=False)
def run_xlnet(device, results_file): set_seed(args.seed) torch.cuda.empty_cache() #get the data logging.info('Constructing datasets...') train_data, dev_data, test_data = read_samples_xlnet() #prepare the model and data model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=args.num_label, output_attentions=False, output_hidden_states=False) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}] optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6) epoch = args.epochs train_iter = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32) dev_iter = DataLoader(dev_data, sampler=SequentialSampler(dev_data), batch_size=32) test_iter = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=32) #create model save directory checkpoint_dir = os.path.join(args.checkpoint_dir, args.model_name) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) #run the tests logging.info( "Number of training samples {train}, number of dev samples {dev}, number of test samples {test}".format( train=len(train_data), dev=len(dev_data), test=len(test_data))) train_xlnet(epoch, model, train_iter, dev_iter, optimizer, device, checkpoint_dir, results_file) model = load_model(checkpoint_dir) acc, f1, recall, prec, f1_ave, recall_ave, prec_ave = test_xlnet(test_iter, model, device) del model return acc, f1, recall, prec, f1_ave, recall_ave, prec_ave
def _config_model(model_name: AvailableClassificationModels, num_labels: int, use_gpu: bool): model_name = str(model_name.value) model = None if 'bert' in model_name: model = BertForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions=True) elif 'xlnet' in model_name: model = XLNetForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions=True) elif 'roberta' in model_name: model = RobertaForSequenceClassification.from_pretrained( model_name, num_labels=num_labels, output_attentions=True) if use_gpu: model.cuda() return model
def __init__(self, hyperparams):
    """
    :param hyperparams: dictionary of hyperparameters
    :type hyperparams: dict

    pretrained_weights in ['xlnet-base-cased', 'xlnet-large-cased'];
    more at https://huggingface.co/transformers/pretrained_models.html
    """
    set_seed(hyperparams["random_state"], hyperparams["n_gpu"])
    pretrained_weights = hyperparams['pretrained_weights']
    self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_weights)
    hyperparams["tokenizer"] = self.tokenizer
    self.hyperparams = hyperparams
    self.model = XLNetForSequenceClassification.from_pretrained(
        pretrained_weights, num_labels=3)
    self.processor = NLIProcessor(hyperparams)
def __init__(self, model_file, batchsize=48, max_len=64):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device {}'.format(device))
    model = XLNetForSequenceClassification.from_pretrained(
        'xlnet-base-cased', num_labels=3)
    model.load_state_dict(torch.load(model_file))
    model = model.to(device)
    self.device = device
    self.model = model
    PRE_TRAINED_MODEL_NAME = 'xlnet-base-cased'
    self.tokenizer = XLNetTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.MAX_LEN = max_len
    self.BATCHSIZE = batchsize
    self.class_names = ['positive', 'negative', 'neutral']
def model_sel(model_type='Bert'):
    if model_type == 'XLNet':
        print("Model is XLNet")
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-large-cased',
            num_labels=5,               # The number of output labels -- 5 classes for this task.
            output_attentions=False,    # Whether the model returns attention weights.
            output_hidden_states=False  # Whether the model returns all hidden states.
        )
        model.cuda()
    elif model_type == 'Bert':
        print("Model is Bert")
        # Load BertForSequenceClassification, the pretrained BERT model with a single
        # linear classification layer on top.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",        # The 12-layer BERT model, with an uncased vocab.
            num_labels=5,               # The number of output labels -- 5 classes for this task.
            output_attentions=False,    # Whether the model returns attention weights.
            output_hidden_states=False  # Whether the model returns all hidden states.
        )
        # Tell pytorch to run this model on the GPU.
        model.cuda()
    return model, tokenizer
def load_model(pretrained_name, model_loc=None, load_tuned=True, num_labels=2): assert pretrained_name is not None if load_tuned: # load previously tuned model from disk if model_loc is None: model_dump_loc, model_state_dic_loc = generate_disk_location() else: model_dump_loc = model_loc model = torch.load(model_dump_loc) logger.info("loading model from {}".format(model_dump_loc)) else: # load pretrained name from hugging face model_name = config["model_name"] if model_name == "bert": model = BertForSequenceClassification.from_pretrained( pretrained_name, num_labels=num_labels) elif model_name == "roberta": model = RobertaForSequenceClassification.from_pretrained( pretrained_name, num_labels=num_labels) elif model_name == "distillbert": model = DistilBertForSequenceClassification.from_pretrained( pretrained_name, num_labels=num_labels) elif model_name == "xlmroberta": model = XLMRobertaForSequenceClassification.from_pretrained( pretrained_name, num_labels=num_labels) elif model_name == "xlnet": model = XLNetForSequenceClassification.from_pretrained( pretrained_name, num_labels=num_labels) else: logger.error("unsupported model: {}".format(model_name)) logger.info("loading pretrained model") tokenizer = AutoTokenizer.from_pretrained(pretrained_name) logger.info("model config: {}".format(model.config)) return model, tokenizer
validation_sampler = SequentialSampler(val_set) validation_dataloader = DataLoader(val_set, sampler=validation_sampler, batch_size=batch_size, collate_fn=collate_fn, num_workers=8) # specify GPU device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device.type == 'cuda': print(torch.cuda.get_device_name(0)) num_labels = 2 model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels) model.to(device) # BERT fine-tuning parameters param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased",
                                                       num_labels=3)  # could also be 4 classes
model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
def main():
    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        # torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. "
    )
    parser.add_argument("--set_type", type=str, help="Specify train/test file.")
    args = parser.parse_args()

    # Load training data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir)
    logger.info("Loading {} dataset".format(args.set_type))
    dataloader = load_featurized_examples(batch_size=32, set_type=args.set_type,
                                          feature_save_path=feature_save_path)

    # Load saved model
    config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=2292)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config)
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    summaries = torch.empty(0, config.d_model).to(device)
    labels = torch.empty(0, config.num_labels).to(device)
    for i, batch in enumerate(dataloader):
        model.eval()
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids = batch
            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            label_ids = label_ids.to(device).float()

            transformer_outputs = model.module.transformer(
                input_ids=input_ids, token_type_ids=segment_ids, input_mask=input_mask)
            output = transformer_outputs[0]
            # extracting the CLS token
            summary = output[:, 0]
            summary = summary.to(device)
            summaries = torch.cat([summaries, summary], dim=0)
            labels = torch.cat([labels, label_ids])

        if i % 1000 == 0 and i > 0:
            logger.info("Embedded and summarized batch {} of {}".format(i, len(dataloader)))

        # Save the embedded representations of the documents every 12,000 batches to save memory
        if i % 12000 == 0 and i > 0:
            logger.info("Saving summaries...")
            torch.save(
                summaries,
                os.path.join(feature_save_path,
                             args.set_type + '_summaries_{}.pt'.format(int(i / 12000))))
            torch.save(
                labels,
                os.path.join(feature_save_path,
                             args.set_type + '_label_ids_{}.pt'.format(int(i / 12000))))
            summaries = torch.empty(0, config.d_model).to(device)
            labels = torch.empty(0, config.num_labels).to(device)

    # Save any remaining embedded representations
    if i % 12000 != 0:
        logger.info("Saving summaries...")
        torch.save(
            summaries,
            os.path.join(feature_save_path,
                         args.set_type + '_summaries_{}.pt'.format(int(math.ceil(i / 12000)))))
        torch.save(
            labels,
            os.path.join(feature_save_path,
                         args.set_type + '_label_ids_{}.pt'.format(int(math.ceil(i / 12000)))))

    return