def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            namekey = k[7:]  # remove the `module.` prefix added by DataParallel
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'cnn':
        model = CNNModel(n_vocab=args.vocab_size, embed_size=args.embed_size,
                         num_classes=args.num_labels, num_filters=args.num_filters,
                         filter_sizes=args.filter_sizes, device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'lstm':
        model = LSTMModel(n_vocab=args.vocab_size, embed_size=args.embed_size,
                          num_classes=args.num_labels, hidden_size=args.hidden_size,
                          device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'char-cnn':
        model = CharCNN(num_features=args.num_features, num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError(f'unknown model type: {args.model_type}')
    return model.to(args.device)
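# A minimal usage sketch for load() above. The checkpoint directory is hypothetical,
# and only the argparse-style fields needed for the 'bert' branch are shown; the
# surrounding imports (torch, os) are assumed as in the snippet above.
import argparse

args = argparse.Namespace(model_type='bert',
                          device='cuda' if torch.cuda.is_available() else 'cpu')
model = load(args, './checkpoints/run1')  # expects checkpoint.pth and config.bin inside
model.eval()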
def main(args):
    """Runs inference on data that has the same format as the given dataset tsv file."""
    seed_everything(args.seed)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    bert_config = BertConfig.from_pretrained(TOK_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification(bert_config)
    model_dir = os.path.join(args.model_dir, args.name)
    model_path = os.path.join(model_dir, 'best.pth')

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model, tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    print("Inference Start!!!")
    pred_answer = inference(model, test_dataset, device, batch_size)

    # make csv file with the predicted answers
    # please keep the directory layout and column names below as they are
    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
def model_infer(config, test_load, k):
    print("***********load model weight*****************")
    model_config = BertConfig()
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)
    model.load_state_dict(
        torch.load('../user_data/save_model/{}_best_model.pth.tar'.format(config.model_name))['status'])
    model = model.to(config.device)

    print("***********make predict for test file*****************")
    model.eval()
    predict_all = []
    with torch.no_grad():
        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(test_load):
            input_ids = input_ids.to(config.device)
            attention_mask = attention_mask.to(config.device)
            token_type_ids = token_type_ids.to(config.device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
            logits = outputs.logits
            pred_pob = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            predict_all.extend(list(pred_pob.detach().cpu().numpy()))

    # submit_result(predict)
    if k == 0:
        df = pd.DataFrame(predict_all, columns=["{}_score".format(k + 1)])
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    else:
        df = pd.read_csv('./{}_result.csv'.format(config.model_name))
        df["{}_score".format(k + 1)] = predict_all
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    print("***********done*****************")
def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            namekey = k[7:]  # remove the `module.` prefix added by DataParallel
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'bow':
        model = BOWModel(new_state_dict['embedding.weight'], n_vocab=args.vocab_size,
                         embed_size=args.embed_size, hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'decom_att':
        model = DecompAttentionModel(args.word_mat, n_vocab=args.vocab_size,
                                     embed_size=args.embed_size, hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'esim':
        model = ESIM(vocab_size=args.vocab_size, embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size, embeddings=None, padding_idx=0,
                     dropout=0.1, num_classes=args.num_labels, device=args.device)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError(f'unknown model type: {args.model_type}')
    return model.to(args.device)
    label = torch.tensor(data=label).type(torch.LongTensor)
    return input_ids, token_type_ids, attention_mask, label


print("***********load test data*****************")
config = roBerta_Config()
vocab = Vocab()
train_data, valid_data, test_data = vocab.get_train_dev_test()
test_dataset = BuildDataSet(test_data)
test_load = DataLoader(dataset=test_dataset,
                       batch_size=config.batch_size,
                       shuffle=False,
                       collate_fn=collate_fn)

print("***********load model weight*****************")
model_config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert_source/bert_config.json")
model = BertForSequenceClassification(config=model_config)
model.load_state_dict(torch.load('save_bert/best_model.pth.tar'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
config.device = device

print("***********make predict for test file*****************")
predict = model_infer(model, config, test_load)
submit_result(predict)
print("***********done*****************")
def train_process(config, train_load, train_sampler, model_name):
    # load source bert weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json".format(model_name))
    # model_config = BertConfig()
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)
    checkpoint = torch.load(
        '../user_data/save_bert/{}_checkpoint.pth.tar'.format(model_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['status'], strict=False)
    print('***********load pretrained mlm {} weight*************'.format(model_name))

    for param in model.parameters():
        param.requires_grad = True

    # 4) move the model to its GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # t_total = len(train_load) * config.num_train_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    # )
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    # 5) wrap the model with DistributedDataParallel
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[config.local_rank])
    model.train()

    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()
        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank, non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank, non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add an adversarial perturbation to the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                loss_adv.backward()  # backprop, accumulating the adversarial gradients on top of the normal ones
                fgm.restore()  # restore the embedding parameters

            optimizer.step()
            # scheduler.step()

        # dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes and compute the distributed loss
        torch.distributed.barrier()

        # reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()
        # if reduce_dev_auc > best_dev_auc:
        #     best_dev_auc = reduce_dev_auc
        #     is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'model_name:{},time:{},epoch:{}/{}'
        if config.local_rank in [0, -1]:
            print(msg.format(model_name, now, epoch + 1, config.num_train_epochs))
            checkpoint = {"status": model.module.state_dict()}
            torch.save(checkpoint,
                       '../user_data/save_model' + os.sep +
                       '{}_checkpoint.pth.tar'.format(model_name))
            del checkpoint

    torch.distributed.barrier()
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
test_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label)

bert_config = BertConfig.from_pretrained('./input/config/bert-base-chinese-config.json')
bert_config.num_labels = len(processor.get_labels())
model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load('./output/best_sim.bin'))
model = model.to(device)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=256)

total_loss = 0.       # running sum of the loss
total_sample_num = 0  # total number of samples
all_real_label = []   # list of all ground-truth labels
all_pred_label = []   # list of all predicted labels
for batch in tqdm(test_dataloader, desc="testing"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
class TorchBertClassifierModel(TorchModel):
    """Bert-based model for text classification on PyTorch.

    It uses the output of the [CLS] token and predicts labels using a linear transformation.

    Args:
        n_classes: number of classes
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        one_hot_labels: set True if one-hot encoding for labels is used
        multilabel: set True if it is multi-label classification
        return_probas: set True if class probabilities should be returned instead of the most probable label
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
        hidden_keep_prob: keep_prob for Bert hidden layers
        optimizer: optimizer name from `torch.optim`
        optimizer_parameters: dictionary with optimizer parameters,
            e.g. {'lr': 0.1, 'weight_decay': 0.001, 'momentum': 0.9}
        clip_norm: clip gradients by norm coefficient
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is a key title)
    """

    def __init__(self, n_classes,
                 pretrained_bert,
                 one_hot_labels: bool = False,
                 multilabel: bool = False,
                 return_probas: bool = False,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 optimizer: str = "AdamW",
                 optimizer_parameters: dict = {"lr": 1e-3, "weight_decay": 0.01,
                                               "betas": (0.9, 0.999), "eps": 1e-6},
                 clip_norm: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 **kwargs) -> None:
        self.return_probas = return_probas
        self.one_hot_labels = one_hot_labels
        self.multilabel = multilabel
        self.pretrained_bert = pretrained_bert
        self.bert_config_file = bert_config_file
        self.attention_probs_keep_prob = attention_probs_keep_prob
        self.hidden_keep_prob = hidden_keep_prob
        self.n_classes = n_classes
        self.clip_norm = clip_norm

        if self.multilabel and not self.one_hot_labels:
            raise RuntimeError('Use one-hot encoded labels for multilabel classification!')
        if self.multilabel and not self.return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        super().__init__(optimizer=optimizer,
                         optimizer_parameters=optimizer_parameters,
                         **kwargs)

    def train_on_batch(self, features: List[InputFeatures],
                       y: Union[List[int], List[List[int]]]) -> Dict:
        """Train the model on the given batch.

        This method calls train_op using features and y (labels).

        Args:
            features: batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with loss and learning_rate values
        """
        input_ids = [f.input_ids for f in features]
        input_masks = [f.attention_mask for f in features]
        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)
        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)
        b_labels = torch.from_numpy(np.array(y)).to(self.device)

        self.optimizer.zero_grad()
        loss, logits = self.model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_masks, labels=b_labels)
        loss.backward()
        # Clip the norm of the gradients to help prevent the "exploding gradients" problem.
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)

        self.optimizer.step()
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return {'loss': loss.item()}

    def __call__(self, features: List[InputFeatures]) -> Union[List[int], List[List[float]]]:
        """Make predictions for the given features (texts).
        Args:
            features: batch of InputFeatures

        Returns:
            predicted classes or probabilities of each class
        """
        input_ids = [f.input_ids for f in features]
        input_masks = [f.attention_mask for f in features]
        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)
        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)

        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = self.model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_masks)
            logits = logits[0]

        if self.return_probas:
            if not self.multilabel:
                pred = torch.nn.functional.softmax(logits, dim=-1)
            else:
                pred = torch.sigmoid(logits)
            pred = pred.detach().cpu().numpy()
        else:
            logits = logits.detach().cpu().numpy()
            pred = np.argmax(logits, axis=1)
        return pred

    @overrides
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
            self.model = BertForSequenceClassification.from_pretrained(
                self.pretrained_bert, num_labels=self.n_classes,
                output_attentions=False, output_hidden_states=False)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))

            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = BertForSequenceClassification(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)

        self.optimizer = getattr(torch.optim, self.optimizer_name)(
            self.model.parameters(), **self.optimizer_parameters)
        if self.lr_scheduler_name is not None:
            self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
                self.optimizer, **self.lr_scheduler_parameters)

        if self.load_path:
            log.info(f"Load path {self.load_path} is given.")
            if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = Path(self.load_path.resolve())
            weights_path = weights_path.with_suffix(".pth.tar")
            if weights_path.exists():
                log.info(f"Load path {weights_path} exists.")
                log.info(f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights and optimizer state from the saved checkpoint
                log.info(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                self.model.load_state_dict(checkpoint["model_state_dict"])
                self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.info(f"Init from scratch. Load path {weights_path} does not exist.")
def test_model(test_data_dir):
    """
    Use the trained models to get the final prediction
    """
    pretrained_models = ['bert-base-uncased', 'xlnet-base-cased', 'roberta-base']

    # load the testing data into a pandas DataFrame
    with open(test_data_dir) as f:
        test_lines = [line.rstrip('\n')[line.rstrip('\n').find(',') + 1:] for line in f]
    test_df = pd.DataFrame(test_lines, columns=['text'])
    # the model input requires some label; we won't actually use it
    test_df['label'] = 1

    for pretrained_model in pretrained_models:
        # load model
        if pretrained_model == 'bert-base-uncased':
            from transformers import BertForSequenceClassification as SequenceClassificationModel
            selected_epochs = bert_picks
        elif pretrained_model == 'xlnet-base-cased':
            from transformers import XLNetForSequenceClassification as SequenceClassificationModel
            selected_epochs = xlnet_picks
        elif pretrained_model == 'roberta-base':
            from transformers import RobertaForSequenceClassification as SequenceClassificationModel
            selected_epochs = roberta_picks
        config = AutoConfig.from_pretrained(pretrained_model)
        model = SequenceClassificationModel(config)

        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        init_token_idx = tokenizer.cls_token_id
        eos_token_idx = tokenizer.sep_token_id
        pad_token_idx = tokenizer.pad_token_id
        unk_token_idx = tokenizer.unk_token_id
        max_input_length = tokenizer.max_model_input_sizes[pretrained_model]

        def tokenize_and_cut(sentence):
            """
            Tokenize the sentence and cut it if it's too long
            """
            tokens = tokenizer.tokenize(sentence)
            # - 2 accounts for the cls and sep tokens
            tokens = tokens[:max_input_length - 2]
            return tokens

        # the xlnet model has no max_model_input_sizes entry, but it actually has a limit,
        # so we set it manually
        if max_input_length is None:
            max_input_length = 512

        # Field handles the conversion to Tensor (tokenizing)
        TEXT = data.Field(batch_first=True,
                          use_vocab=False,
                          tokenize=tokenize_and_cut,
                          preprocessing=tokenizer.convert_tokens_to_ids,
                          init_token=init_token_idx,
                          eos_token=eos_token_idx,
                          pad_token=pad_token_idx,
                          unk_token=unk_token_idx)
        LABEL = data.LabelField(dtype=torch.long, use_vocab=False)

        # transform the DataFrame into a torchtext Dataset
        print('Transforming testing data for', pretrained_model, 'model')
        test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, test_df=test_df)

        BATCH_SIZE = 32
        # get a gpu if possible
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        test_iterator = data.Iterator(test_data,
                                      batch_size=BATCH_SIZE,
                                      device=device,
                                      shuffle=False,
                                      sort=False,
                                      train=False)

        for selected_epoch in selected_epochs:
            # load the trained model
            model.load_state_dict(
                torch.load(os.path.join('models',
                                        f'{pretrained_model}-e{selected_epoch:02}-model.pt'),
                           map_location=device))
            model = model.eval()

            # get predictions for the test data
            print(f'Testing for {pretrained_model} epoch {selected_epoch}')
            predictions = test(model, test_iterator)

            # map predictions back to the original labels
            label_map = {0: -1, 1: 1}
            corrected_predictions = list(map(lambda x: label_map[x], predictions))

            # load data into a dataframe and write the submission file
            submission = pd.read_csv('predictions_test/sample_submission.csv')
            submission.Prediction = corrected_predictions
            submission.to_csv(os.path.join('predictions_test',
                                           f'{pretrained_model}-e{selected_epoch:02}.csv'),
                              index=False)

    test_predictions('predictions_test')
import re

import emoji
from soynlp.normalizer import repeat_normalize

finetune_ckpt = './your_local_path/BaekBERT.ckpt'
test_path = '../data/testset/inferset.csv'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
args = Arg()

ckp = torch.load(finetune_ckpt, map_location=torch.device('cpu'))
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model,
    num_labels=ckp['state_dict']['bert.classifier.bias'].shape.numel(),
)
model = BertForSequenceClassification(pretrained_model_config)
# drop the leading 'bert.' prefix from the checkpoint keys before loading
model.load_state_dict({k[5:]: v for k, v in ckp['state_dict'].items()})
model.to(device)
model.eval()


def read_data(path):
    if path.endswith('xlsx'):
        return pd.read_excel(path)
    elif path.endswith('csv'):
        return pd.read_csv(path)
    elif path.endswith('tsv') or path.endswith('txt'):
        return pd.read_csv(path, sep='\t')
    else:
        raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')
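# A minimal usage sketch for read_data() above, reading the test set path declared earlier.
test_df = read_data(test_path)
print(test_df.head())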
class init_class:
    def __init__(self):
        set_seed()
        self.sess = []
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained('../user_data/vocab')
        for model_name in ['bert', 'rbtl']:
            model_config = BertConfig.from_pretrained(
                pretrained_model_name_or_path="../user_data/bert_source/{}_config.json".format(model_name))
            model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
            self.model = BertForSequenceClassification(config=model_config)
            checkpoint = torch.load(
                '../user_data/save_model/{}_checkpoint.pth.tar'.format(model_name),
                map_location='cpu')
            self.model.load_state_dict(checkpoint['status'])

            # export the PyTorch model to ONNX
            MODEL_ONNX_PATH = "./torch_{}_dynamic.onnx".format(model_name)
            OPERATOR_EXPORT_TYPE = torch._C._onnx.OperatorExportTypes.ONNX
            self.model.eval()
            org_dummy_input = make_train_dummy_input()
            inf_dummy_input = make_inference_dummy_input()
            dynamic_axes = {
                'input_ids': [1],
                'token_type_ids': [1],
                'attention_mask': [1]
            }
            output = torch.onnx.export(
                self.model,
                org_dummy_input,
                MODEL_ONNX_PATH,
                verbose=False,
                operator_export_type=OPERATOR_EXPORT_TYPE,
                opset_version=10,
                input_names=['input_ids', 'token_type_ids', 'attention_mask'],
                output_names=['output'],
                dynamic_axes=dynamic_axes)
            self.sess.append(onnxruntime.InferenceSession(MODEL_ONNX_PATH))

    def __getitem__(self, text):
        inputs = self.tokenizer(text, return_tensors="pt")
        result = []
        for sess in self.sess:
            pred_onnx = sess.run(
                None, {
                    'input_ids': inputs['input_ids'].numpy(),
                    'token_type_ids': inputs['token_type_ids'].numpy(),
                    'attention_mask': inputs['attention_mask'].numpy()
                })
            pred_pob = torch.nn.functional.softmax(torch.tensor(pred_onnx[0]), dim=1)[:, 1]
            result.append(pred_pob[0].cpu().item())
        return np.mean(result)
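# A minimal usage sketch for the ONNX ensemble above; the input sentence is a
# hypothetical example, and all paths come from the class constructor as written.
scorer = init_class()
score = scorer["a sample sentence to score"]  # averaged positive-class probability across both models
print(score)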
    return logits


def predict(inputs, model, device):
    sentences = inputs['texts'].values()
    logits = test_sentences(sentences, model, device)
    arrs = np.exp(logits)
    arrs = arrs / arrs.sum(axis=1).reshape(-1, 1)  # softmax over the two classes
    # keys: '긍정' = positive, '부정' = negative
    return {
        id: {'긍정': arr[1], '부정': arr[0]}
        for id, arr in zip(inputs['texts'].keys(), arrs)
    }


with open('bertconfig200724.pkl', 'rb') as f:
    config = pickle.load(f)
config.num_labels = 2

GPU_NUM = 0
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load('bert200724.pt'))
model.to(device)
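# A minimal usage sketch for predict() above; the ids and Korean review texts are
# hypothetical, and the model/device are the ones loaded just above.
sample = {'texts': {'doc-1': '배송이 정말 빨라요',        # "delivery is really fast"
                    'doc-2': '품질이 너무 실망스럽습니다'}}  # "the quality is very disappointing"
scores = predict(sample, model, device)  # {'doc-1': {'긍정': ..., '부정': ...}, ...}
print(scores)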
                                 piece=args.piece,
                                 piece_model=args.piece_model)

# load bert model
config = BertConfig.from_json_file(args.config_file)
model = BertForSequenceClassification(config)
model_state_dict = model.state_dict()
print('Model parameters: {}'.format(sum(p.numel() for k, p in model_state_dict.items())))
pre_state_dict = torch.load(args.pretrained_file)
pre_state_dict = {k: v for k, v in pre_state_dict.items() if k in model_state_dict}
model_state_dict.update(pre_state_dict)
model.load_state_dict(model_state_dict)
if args.cuda:
    model.cuda()

# load data
data = BERTCLDCDataReader(args, tokenizer)
# general info for cldc
cldc_log = (
    'CLDC lang: {}\n'.format(', '.join(args.cldc_lang)) +
    'Label size: {}\n'.format(data.label_size) +
    'Labels: [{}]\n'.format(' '.join([lb for idx, lb in data.idx2label.items()])) +
    'Train percentage: {}\n'.format(args.scale) +
    'Val every: {}\n'.format(args.VAL_EVERY) +
    'Train size: {} [{}]\n'.format(
        data.train_size,
        ' '.join(
        loss, logits = outputs[:2]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        acc = batch_accuracy(logits, label_tensor)
        print('epoch:{} | acc:{} | loss:{}'.format(epoch, acc, loss))

torch.save(model.state_dict(), 'bert_cla.ckpt')
print('Saved the trained model...')

# testing
print('Loading the trained model...')
model.load_state_dict(torch.load('bert_cla.ckpt'))
print('Starting testing...')
model.eval()
test_result = []
for item in test_dataset:
    text_list = list(json.loads(item[1]))
    text_tensor = torch.tensor(text_list).unsqueeze(0).to(device)
    with torch.no_grad():
        print('list', text_list)
        print('tensor', text_tensor)
        print('tensor.shape', text_tensor.shape)
        outputs = model(text_tensor, labels=None)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
test_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label)

bert_config = BertConfig.from_pretrained('./input/config/bert-base-chinese-config.json')
bert_config.num_labels = len(processor.get_labels())
model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load('./output/best_sim.bin', map_location=device))
model = model.to(device)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=256)

total_loss = 0.       # running sum of the loss
total_sample_num = 0  # total number of samples
all_real_label = []   # list of all ground-truth labels
all_pred_label = []   # list of all predicted labels
for batch in tqdm(test_dataloader, desc="testing"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
config = BertConfig.from_pretrained('bert-base-chinese', resume_download=True)
config.num_labels = 2
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)
optimizer = AdamW(model.parameters(), lr=LR, correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total=T_TOTAL)
# optimizer = optim.Adam(model.parameters(), lr=LR)

print('Loading the trained model...')
model.load_state_dict(torch.load('90.9847368421052632_bert_cla.ckpt'))
print('Starting testing...')
model.eval()
model = model.cuda()
test_result = []
num = 0
for item in test_dataset:
    text_list = list(json.loads(item[1]))
    text_tensor = torch.tensor(text_list).unsqueeze(0).cuda()
    with torch.no_grad():
        # print('list', text_list)
        # print('tensor', text_tensor)
        # print('tensor.shape', text_tensor.shape)
        outputs = model(text_tensor, labels=None)
        num += 1
if args.use_gpu:
    pin_mem = True
else:
    pin_mem = False

# Generates a dataloader that outputs the entire set as a batch for one-time predictions
raw_loader = torch.utils.data.DataLoader(raw_data,
                                         batch_size=args.data_batch_size,
                                         collate_fn=collate_fn,
                                         pin_memory=pin_mem)

abs_model_path = Path(args.model_path)
config_file = abs_model_path.parent / "config.json"
config = BertConfig.from_json_file(config_file)
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load(abs_model_path))
model.to(device)
model.eval()
torch.set_grad_enabled(False)  # a bare `torch.no_grad()` call has no effect; disable gradients globally for inference
print("Model Loaded")
print(model)
print("-------------------")

data_logit_list = []
for batch in tqdm(raw_loader):
    current_logits = eval_util.calculate_batched_predictions(
        batch, model, device, args.target_publication)
    data_logit_list = data_logit_list + list(current_logits)

converted_list = np.array(data_logit_list)
sorted_preds = np.sort(converted_list)
indices = np.argsort(converted_list)
        'pos_width': f"{pos_prob * 100}%",
        'neg_width': f"{neg_prob * 100}%",
    }

# %% set hyperparameters
args = ClassificationDeployArguments(
    pretrained_model_name="beomi/kcbert-base",
    downstream_model_dir="nlpbook/checkpoint-doccls",
    max_seq_length=128,
)

# %% load model
fine_tuned_model_ckpt = torch.load(args.downstream_model_checkpoint_fpath,
                                   map_location=torch.device("cuda"))
pt_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=fine_tuned_model_ckpt['state_dict']['model.classifier.bias'].shape.numel(),
)
model = BertForSequenceClassification(pt_model_config)
model.load_state_dict({
    k.replace("model.", ""): v
    for k, v in fine_tuned_model_ckpt['state_dict'].items()
})
model.eval()

tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)
def get_sim_model(config_file, pre_train_model, label_num=2):
    bert_config = BertConfig.from_pretrained(config_file)
    bert_config.num_labels = label_num
    model = BertForSequenceClassification(bert_config)
    model.load_state_dict(torch.load(pre_train_model))
    return model
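# A minimal usage sketch for get_sim_model() above; both paths are hypothetical
# (they mirror the config and checkpoint paths used elsewhere in this section).
sim_model = get_sim_model('./input/config/bert-base-chinese-config.json',
                          './output/best_sim.bin', label_num=2)
sim_model.eval()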
class bert_classifier(object):
    def __init__(self):
        self.config = Config()
        self.device_setup()
        self.model_setup()

    def device_setup(self):
        """
        Configure the device and load the BERT model.
        :return:
        """
        # use the GPU by moving the model with model.to(device)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model_save_path = self.config.get("result", "model_save_path")
        config_save_path = self.config.get("result", "config_save_path")
        vocab_save_path = self.config.get("result", "vocab_save_path")

        self.model_config = BertConfig.from_json_file(config_save_path)
        self.model = BertForSequenceClassification(self.model_config)
        self.state_dict = torch.load(model_save_path)
        self.model.load_state_dict(self.state_dict)
        self.tokenizer = transformers.BertTokenizer(vocab_save_path)
        self.model.to(self.device)
        self.model.eval()

    def model_setup(self):
        weight_decay = self.config.get("training_rule", "weight_decay")
        learning_rate = self.config.get("training_rule", "learning_rate")

        # define the optimizer and loss function
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        }, {
            'params': [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()

    def predict(self, sentence):
        input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, sentence)
        input_ids = seq_padding(self.tokenizer, [input_ids])
        token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
        # LongTensor is required
        input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
        # zero the gradients
        self.optimizer.zero_grad()
        # move to the GPU
        input_ids, token_type_ids = input_ids.to(self.device), token_type_ids.to(self.device)
        output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        y_pred_prob = output[0]
        y_pred_label = y_pred_prob.argmax(dim=1)
        print(y_pred_label)
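# A minimal usage sketch for bert_classifier above; the input sentence is a
# hypothetical Chinese example ("the service at this shop is great"), and all
# paths come from the Config object as in device_setup().
classifier = bert_classifier()
classifier.predict("这家店的服务很好")  # prints the predicted label tensor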
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)

if args.mode == 'test':
    test = FeverDataset(args.test_dataset)
    # print(Counter([_x['label'] for _x in test]).most_common(3))
    test_dl = BucketBatchSampler(batch_size=args.batch_size,
                                 sort_key=sort_key,
                                 dataset=test,
                                 collate_fn=collate_fn)
    checkpoint = torch.load(args.model_path)
    model.load_state_dict(checkpoint['model'])
    print(eval_model(model, test_dl))
else:
    print("Loading datasets...")
    train = FeverDataset(args.train_dataset)
    dev = FeverDataset(args.dev_dataset)
    # print(Counter([_x['label'] for _x in train]).most_common(3))
    # print(Counter([_x['label'] for _x in dev]).most_common(3))
    train_dl = BucketBatchSampler(batch_size=args.batch_size,
                                  sort_key=sort_key,
                                  dataset=train,
                                  collate_fn=collate_fn)
    dev_dl = BucketBatchSampler(batch_size=args.batch_size,
class BERT():
    def __init__(self, model_path=None, config=None):
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        # load tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        # load model configuration
        if config is None:
            config = BertConfig()

        # path to the saved model file
        if model_path is None:
            base_dir = os.path.dirname(os.path.realpath(__file__))
            model_dir = os.path.join(base_dir, '.models')
            os.makedirs(model_dir, exist_ok=True)
            url = "https://www.dropbox.com/s/jw18aln9rmg69d6/BERT_Weights.pt?dl=0"
            model_name = os.path.split(url)[-1][:-5]
            model_path = os.path.join(model_dir, model_name)
            # download model weights if they are not cached yet
            if not os.path.exists(model_path):
                subprocess.call(['wget', url, '-O', model_path])

        # load pre-trained model
        self.model = BertForSequenceClassification(config)
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))

    def preprocess(self, text):
        # encode text
        input_encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # setup BERT parameters
        input_ids = input_encoded["input_ids"]
        attention_mask = input_encoded["attention_mask"]

        # prepare dataset
        pred_data = TensorDataset(input_ids, attention_mask)
        # sample dataset sequentially
        pred_sampler = SequentialSampler(pred_data)
        # prepare dataloader
        pred_dl = DataLoader(pred_data, sampler=pred_sampler, batch_size=1)
        return pred_dl

    @torch.no_grad()
    def predict(self, pred_dl):
        for s, b in enumerate(pred_dl):
            # get batch
            b = tuple(t.to(self.device) for t in b)
            # get BERT parameters
            input_idsx, attention_maskx = b
            # predict
            outs = self.model(
                input_ids=input_idsx,
                attention_mask=attention_maskx,
                token_type_ids=None,
            )
            # predictions
            logits = outs[0]
            logits = logits.detach().cpu().numpy()
            logits = np.argmax(logits, axis=-1).item()
            return logits
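# A minimal usage sketch for the BERT wrapper above; the input text is hypothetical.
clf = BERT()                                   # downloads the Dropbox weights on first use
pred_dl = clf.preprocess("This movie was surprisingly good.")
label = clf.predict(pred_dl)                   # argmax class index from the logits
print(label)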