def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            namekey = k[7:]  # strip the `module.` prefix added by DataParallel
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'cnn':
        model = CNNModel(n_vocab=args.vocab_size, embed_size=args.embed_size, num_classes=args.num_labels,
                         num_filters=args.num_filters, filter_sizes=args.filter_sizes, device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'lstm':
        model = LSTMModel(n_vocab=args.vocab_size, embed_size=args.embed_size, num_classes=args.num_labels,
                          hidden_size=args.hidden_size, device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'char-cnn':
        model = CharCNN(num_features=args.num_features, num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')

    return model.to(args.device)
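A minimal usage sketch for the loader above, assuming `torch` is imported as in the snippet and that the checkpoint directory contains checkpoint.pth (plus config.bin for the bert branch); the Namespace fields and the directory path are placeholders:

from argparse import Namespace

# hypothetical arguments; the other branches ('cnn', 'lstm', 'char-cnn') need their own fields
args = Namespace(model_type='bert',
                 device='cuda' if torch.cuda.is_available() else 'cpu')
model = load(args, checkpoint_dir='./checkpoints/run1')  # placeholder path
model.eval()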
Code Example #2
def main(args):
    """
    This code can run inference as long as the data has the same format as the given dataset tsv file.
    """
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    bert_config = BertConfig.from_pretrained(TOK_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification(bert_config)

    model_dir = os.path.join(args.model_dir, args.name)
    model_path = os.path.join(model_dir, 'best.pth')

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model,
                                                 tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    print("Inference Start!!!")
    pred_answer = inference(model, test_dataset, device, batch_size)
    # make csv file with predicted answer
    # Please keep the directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
Code Example #3
def model_infer(config, test_load, k):
    
    print("***********load model weight*****************")

    model_config = BertConfig()
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    
    model = BertForSequenceClassification(config=model_config)
    model.load_state_dict(torch.load('../user_data/save_model/{}_best_model.pth.tar'.format(config.model_name))['status'])
    model = model.to(config.device)

    print("***********make predict for test file*****************")

    
    model.eval()
    predict_all = []

    with torch.no_grad():
        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(test_load):
            input_ids = input_ids.to(config.device)
            attention_mask = attention_mask.to(config.device)
            token_type_ids = token_type_ids.to(config.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

            logits = outputs.logits
            pred_pob = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            predict_all.extend(list(pred_pob.detach().cpu().numpy()))
    
#     submit_result(predict)
    if k == 0:
        df = pd.DataFrame(predict_all, columns=["{}_score".format(k + 1)])
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    else:
        df = pd.read_csv('./{}_result.csv'.format(config.model_name))
        df["{}_score".format(k + 1)] = predict_all
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    
    print("***********done*****************")
def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('module.'):
            namekey = k[7:]  # strip the `module.` prefix added by DataParallel
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(
            os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'bow':
        model = BOWModel(new_state_dict['embedding.weight'],
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'decom_att':
        model = DecompAttentionModel(args.word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'esim':
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=None,
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')

    return model.to(args.device)
Code Example #5
File: infer.py  Project: tanshoudong/NLP-program
    label = torch.tensor(data=label).type(torch.LongTensor)
    return input_ids, token_type_ids, attention_mask, label


print("***********load test data*****************")

config = roBerta_Config()
vocab = Vocab()
train_data, valid_data, test_data = vocab.get_train_dev_test()
test_dataset = BuildDataSet(test_data)
test_load = DataLoader(dataset=test_dataset,
                       batch_size=config.batch_size,
                       shuffle=False,
                       collate_fn=collate_fn)

print("***********load model weight*****************")

model_config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert_source/bert_config.json")
model = BertForSequenceClassification(config=model_config)
model.load_state_dict(torch.load('save_bert/best_model.pth.tar'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
config.device = device

print("***********make predict for test file*****************")

predict = model_infer(model, config, test_load)
submit_result(predict)
print("***********done*****************")
Code Example #6
def train_process(config, train_load, train_sampler, model_name):
    # load source bert weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json"
        .format(model_name))
    # model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    checkpoint = torch.load(
        '../user_data/save_bert/{}_checkpoint.pth.tar'.format(model_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['status'], strict=False)
    print('***********load pretrained mlm {} weight*************'.format(
        model_name))

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to the corresponding GPU before wrapping it for distributed training
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    #     t_total = len(train_load) * config.num_train_epochs
    #     scheduler = get_linear_schedule_with_warmup(
    #         optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    #     )

    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model with DistributedDataParallel
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()

        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            #             torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add adversarial perturbation to the embedding weights
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                loss_adv.backward()  # backprop the adversarial loss, accumulating its gradients on top of the normal ones
                fgm.restore()  # restore the original embedding weights

            optimizer.step()
        #             scheduler.step()

        # dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes and compute the distributed loss
        torch.distributed.barrier()
        # reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        # if reduce_dev_auc > best_dev_auc:
        #     best_dev_auc = reduce_dev_auc
        #     is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'model_name:{},time:{},epoch:{}/{}'

        if config.local_rank in [0, -1]:
            print(
                msg.format(model_name, now, epoch + 1,
                           config.num_train_epochs))
            checkpoint = {"status": model.module.state_dict()}
            torch.save(
                checkpoint, '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(model_name))
            del checkpoint

    torch.distributed.barrier()
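The FGM helper used above is not defined in this snippet. A minimal sketch of the usual embedding-level Fast Gradient Method (assuming `torch` is imported as in the snippet; the default epsilon and the embedding parameter name are assumptions) could look like:

class FGM:
    """Adds/removes an adversarial perturbation on the embedding weights."""
    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name  # substring identifying the embedding parameters (assumption)
        self.backup = {}

    def attack(self):
        # perturb the embedding weights along the gradient direction
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # put the original embedding weights back
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.backup[name]
        self.backup = {}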
Code Example #7
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                  dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                  dtype=torch.long)
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
test_dataset = TensorDataset(all_input_ids, all_attention_mask,
                             all_token_type_ids, all_label)

bert_config = BertConfig.from_pretrained(
    './input/config/bert-base-chinese-config.json')
bert_config.num_labels = len(processor.get_labels())

model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load('./output/best_sim.bin'))
model = model.to(device)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset,
                             sampler=test_sampler,
                             batch_size=256)

total_loss = 0.  # running sum of the loss
total_sample_num = 0  # total number of samples
all_real_label = []  # all ground-truth labels
all_pred_label = []  # all predicted labels

for batch in tqdm(test_dataloader, desc="testing"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
Code Example #8
class TorchBertClassifierModel(TorchModel):
    """Bert-based model for text classification on PyTorch.

    It uses output from [CLS] token and predicts labels using linear transformation.

    Args:
        n_classes: number of classes
        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
        one_hot_labels: set True if one-hot encoding for labels is used
        multilabel: set True if it is multi-label classification
        return_probas: set True to return class probabilities instead of the most probable label
        attention_probs_keep_prob: keep_prob for Bert self-attention layers
        hidden_keep_prob: keep_prob for Bert hidden layers
        optimizer: optimizer name from `torch.optim`
        optimizer_parameters: dictionary with optimizer's parameters,
                              e.g. {'lr': 0.1, 'weight_decay': 0.001, 'momentum': 0.9}
        clip_norm: clip gradients by norm coefficient
        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)
    """
    def __init__(self,
                 n_classes,
                 pretrained_bert,
                 one_hot_labels: bool = False,
                 multilabel: bool = False,
                 return_probas: bool = False,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 optimizer: str = "AdamW",
                 optimizer_parameters: dict = {
                     "lr": 1e-3,
                     "weight_decay": 0.01,
                     "betas": (0.9, 0.999),
                     "eps": 1e-6
                 },
                 clip_norm: Optional[float] = None,
                 bert_config_file: Optional[str] = None,
                 **kwargs) -> None:

        self.return_probas = return_probas
        self.one_hot_labels = one_hot_labels
        self.multilabel = multilabel
        self.pretrained_bert = pretrained_bert
        self.bert_config_file = bert_config_file
        self.attention_probs_keep_prob = attention_probs_keep_prob
        self.hidden_keep_prob = hidden_keep_prob
        self.n_classes = n_classes
        self.clip_norm = clip_norm

        if self.multilabel and not self.one_hot_labels:
            raise RuntimeError(
                'Use one-hot encoded labels for multilabel classification!')

        if self.multilabel and not self.return_probas:
            raise RuntimeError(
                'Set return_probas to True for multilabel classification!')

        super().__init__(optimizer=optimizer,
                         optimizer_parameters=optimizer_parameters,
                         **kwargs)

    def train_on_batch(self, features: List[InputFeatures],
                       y: Union[List[int], List[List[int]]]) -> Dict:
        """Train model on given batch.
        This method calls train_op using features and y (labels).

        Args:
            features: batch of InputFeatures
            y: batch of labels (class id or one-hot encoding)

        Returns:
            dict with the loss value
        """
        input_ids = [f.input_ids for f in features]
        input_masks = [f.attention_mask for f in features]

        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)
        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)
        b_labels = torch.from_numpy(np.array(y)).to(self.device)

        self.optimizer.zero_grad()

        loss, logits = self.model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_masks,
                                  labels=b_labels)
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.clip_norm)

        self.optimizer.step()
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return {'loss': loss.item()}

    def __call__(
            self, features: List[InputFeatures]
    ) -> Union[List[int], List[List[float]]]:
        """Make prediction for given features (texts).

        Args:
            features: batch of InputFeatures

        Returns:
            predicted classes or probabilities of each class

        """
        input_ids = [f.input_ids for f in features]
        input_masks = [f.attention_mask for f in features]

        b_input_ids = torch.cat(input_ids, dim=0).to(self.device)
        b_input_masks = torch.cat(input_masks, dim=0).to(self.device)

        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = self.model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_masks)
            logits = logits[0]

        if self.return_probas:
            if not self.multilabel:
                pred = torch.nn.functional.softmax(logits, dim=-1)
            else:
                pred = torch.nn.functional.sigmoid(logits)
            pred = pred.detach().cpu().numpy()
        else:
            logits = logits.detach().cpu().numpy()
            pred = np.argmax(logits, axis=1)
        return pred

    @overrides
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
            self.model = BertForSequenceClassification.from_pretrained(
                self.pretrained_bert,
                num_labels=self.n_classes,
                output_attentions=False,
                output_hidden_states=False)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.bert_config = BertConfig.from_json_file(
                str(expand_path(self.bert_config_file)))

            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = BertForSequenceClassification(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)

        self.optimizer = getattr(torch.optim, self.optimizer_name)(
            self.model.parameters(), **self.optimizer_parameters)
        if self.lr_scheduler_name is not None:
            self.lr_scheduler = getattr(torch.optim.lr_scheduler,
                                        self.lr_scheduler_name)(
                                            self.optimizer,
                                            **self.lr_scheduler_parameters)

        if self.load_path:
            log.info(f"Load path {self.load_path} is given.")
            if isinstance(self.load_path,
                          Path) and not self.load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = Path(self.load_path.resolve())
            weights_path = weights_path.with_suffix(f".pth.tar")
            if weights_path.exists():
                log.info(f"Load path {weights_path} exists.")
                log.info(
                    f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                log.info(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                self.model.load_state_dict(checkpoint["model_state_dict"])
                self.optimizer.load_state_dict(
                    checkpoint["optimizer_state_dict"])
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.info(
                    f"Init from scratch. Load path {weights_path} does not exist."
                )
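A minimal instantiation sketch for the class above; the paths and extra keyword arguments are assumptions (DeepPavlov's TorchModel base class typically also expects save_path/load_path):

clf = TorchBertClassifierModel(
    n_classes=2,
    pretrained_bert="bert-base-uncased",
    return_probas=True,
    save_path="./clf_ckpt/model",   # placeholder path
    load_path="./clf_ckpt/model",   # placeholder path
)
probs = clf(features)  # features: a batch of InputFeatures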
Code Example #9
def test_model(test_data_dir):
    """ Use trained models to get the final prediction """
    pretrained_models = ['bert-base-uncased', 'xlnet-base-cased', 'roberta-base']
    # load testing data into pandas DataFrame
    with open(test_data_dir) as f:
        test_lines = [line.rstrip('\n')[line.rstrip('\n').find(',') + 1:] for line in f]

    test_df = pd.DataFrame(test_lines, columns=['text'])
    # the model input requires a label column, though we won't actually use it
    test_df['label'] = 1

    for pretrained_model in pretrained_models:
        # load model
        if pretrained_model == 'bert-base-uncased':
            from transformers import BertForSequenceClassification as SequenceClassificationModel
            selected_epochs = bert_picks
        elif pretrained_model == 'xlnet-base-cased':
            from transformers import XLNetForSequenceClassification as SequenceClassificationModel
            selected_epochs = xlnet_picks
        elif pretrained_model == 'roberta-base':
            from transformers import RobertaForSequenceClassification as SequenceClassificationModel
            selected_epochs = roberta_picks

        config = AutoConfig.from_pretrained(pretrained_model)
        model = SequenceClassificationModel(config)

        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        init_token_idx = tokenizer.cls_token_id
        eos_token_idx = tokenizer.sep_token_id
        pad_token_idx = tokenizer.pad_token_id
        unk_token_idx = tokenizer.unk_token_id

        max_input_length = tokenizer.max_model_input_sizes[pretrained_model]

        def tokenize_and_cut(sentence):
            """ Tokenize the sentence and cut it if it's too long """
            tokens = tokenizer.tokenize(sentence)
            # - 2 is for cls and sep tokens
            tokens = tokens[:max_input_length - 2]
            return tokens

        # the xlnet tokenizer has no max_model_input_sizes entry, but the model actually has a limit,
        # so we set it manually
        if max_input_length is None:
            max_input_length = 512

        # Field handles the conversion to Tensor (tokenizing)
        TEXT = data.Field(
            batch_first=True,
            use_vocab=False,
            tokenize=tokenize_and_cut,
            preprocessing=tokenizer.convert_tokens_to_ids,
            init_token=init_token_idx,
            eos_token=eos_token_idx,
            pad_token=pad_token_idx,
            unk_token=unk_token_idx
        )

        LABEL = data.LabelField(dtype=torch.long, use_vocab=False)

        # transform DataFrame into torchtext Dataset
        print('Transforming testing data for', pretrained_model, 'model')
        test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, test_df=test_df)

        BATCH_SIZE = 32
        # get gpu if possible
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        test_iterator = data.Iterator(test_data, batch_size=BATCH_SIZE, device=device, shuffle=False, sort=False, train=False)

        for selected_epoch in selected_epochs:
            # load trained model
            model.load_state_dict(
                torch.load(os.path.join(
                    'models',
                    f'{pretrained_model}-e{selected_epoch:02}-model.pt'
                ), map_location=device)
            )
            model = model.eval()

            # get predictions of test data
            print(f'Testing for {pretrained_model} epoch {selected_epoch}')
            predictions = test(model, test_iterator)

            # map predictions to match the original
            label_map = {0: -1, 1: 1}
            corrected_predictions = list(map(lambda x: label_map[x], predictions))

            # load data into dataframe
            submission = pd.read_csv('predictions_test/sample_submission.csv')
            submission.Prediction = corrected_predictions
            submission.to_csv(os.path.join('predictions_test', f'{pretrained_model}-e{selected_epoch:02}.csv'), index=False)

    test_predictions('predictions_test')
Code Example #10
import re
import emoji
from soynlp.normalizer import repeat_normalize

finetune_ckpt = './your_local_path/BaekBERT.ckpt'
test_path = '../data/testset/inferset.csv'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
args = Arg()

ckp = torch.load(finetune_ckpt, map_location=torch.device('cpu'))
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model,
    num_labels=ckp['state_dict']['bert.classifier.bias'].shape.numel(),
)
model = BertForSequenceClassification(pretrained_model_config)
model.load_state_dict({k[5:]: v for k, v in ckp['state_dict'].items()})
model.to(device)
model.eval()


def read_data(path):
    if path.endswith('xlsx'):
        return pd.read_excel(path)
    elif path.endswith('csv'):
        return pd.read_csv(path)
    elif path.endswith('tsv') or path.endswith('txt'):
        return pd.read_csv(path, sep='\t')
    else:
        raise NotImplementedError(
            'Only Excel(xlsx)/Csv/Tsv(txt) are Supported')
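The re, emoji and soynlp imports above suggest a KcBERT-style text-cleaning step that is not shown in this snippet. A minimal sketch of such a function (the URL pattern is an assumption, and emoji handling is omitted) might be:

url_pattern = re.compile(
    r'https?://(?:www\.)?[-\w@:%.+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-\w()@:%+.~#?&/=]*')

def clean(text):
    # drop URLs, collapse long character repetitions (e.g. 'ㅋㅋㅋㅋㅋ' -> 'ㅋㅋ'), trim whitespace
    text = url_pattern.sub('', text)
    text = repeat_normalize(text, num_repeats=2)
    return text.strip()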
Code Example #11
class init_class:
    def __init__(self):
        set_seed()
        self.sess = []

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained('../user_data/vocab')

        for model_name in ['bert', 'rbtl']:
            model_config = BertConfig.from_pretrained(
                pretrained_model_name_or_path=
                "../user_data/bert_source/{}_config.json".format(model_name))
            model_config.vocab_size = len(
                pd.read_csv('../user_data/vocab', names=["score"]))

            self.model = BertForSequenceClassification(config=model_config)
            checkpoint = torch.load(
                '../user_data/save_model/{}_checkpoint.pth.tar'.format(
                    model_name),
                map_location='cpu')
            self.model.load_state_dict(checkpoint['status'])

            # convert the PyTorch model to ONNX
            MODEL_ONNX_PATH = "./torch_{}_dynamic.onnx".format(model_name)
            OPERATOR_EXPORT_TYPE = torch._C._onnx.OperatorExportTypes.ONNX
            self.model.eval()
            org_dummy_input = make_train_dummy_input()
            inf_dummy_input = make_inference_dummy_input()
            dynamic_axes = {
                'input_ids': [1],
                'token_type_ids': [1],
                'attention_mask': [1]
            }
            output = torch.onnx.export(
                self.model,
                org_dummy_input,
                MODEL_ONNX_PATH,
                verbose=False,
                operator_export_type=OPERATOR_EXPORT_TYPE,
                opset_version=10,
                input_names=['input_ids', 'token_type_ids', 'attention_mask'],
                output_names=['output'],
                dynamic_axes=dynamic_axes)

            self.sess.append(onnxruntime.InferenceSession(MODEL_ONNX_PATH))

    def __getitem__(self, text):
        inputs = self.tokenizer(text, return_tensors="pt")
        result = []
        for sess in self.sess:
            pred_onnx = sess.run(
                None, {
                    'input_ids': inputs['input_ids'].numpy(),
                    'token_type_ids': inputs['token_type_ids'].numpy(),
                    'attention_mask': inputs['attention_mask'].numpy()
                })

            pred_pob = torch.nn.functional.softmax(torch.tensor(pred_onnx[0]),
                                                   dim=1)[:, 1]

            result.append(pred_pob[0].cpu().item())
        return np.mean(result)
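A minimal usage sketch for the class above (the input sentence is an arbitrary example):

predictor = init_class()
score = predictor["an example sentence to score"]  # averaged positive-class probability from both ONNX sessions
print(score)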
Code Example #12
File: bert.py  Project: jinmang2/HomunculusInFlask
    return logits


def predict(inputs, model, device):
    sentences = inputs['texts'].values()
    logits = test_sentences(sentences, model, device)
    arrs = np.exp(logits)
    arrs = arrs / arrs.sum(axis=1).reshape(-1, 1)
    return {
        id: {
            '긍정': arr[1],  # positive
            '부정': arr[0]   # negative
        }
        for id, arr in zip(inputs['texts'].keys(), arrs)
    }


with open('bertconfig200724.pkl', 'rb') as f:
    config = pickle.load(f)

config.num_labels = 2

GPU_NUM = 0
device = torch.device(
    f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                          do_lower_case=False)
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load('bert200724.pt'))
model.to(device)
Code Example #13
                                           piece=args.piece,
                                           piece_model=args.piece_model)

    # load bert model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForSequenceClassification(config)
    model_state_dict = model.state_dict()
    print('Model parameters: {}'.format(
        sum(p.numel() for k, p in model_state_dict.items())))
    pre_state_dict = torch.load(args.pretrained_file)
    pre_state_dict = {
        k: v
        for k, v in pre_state_dict.items() if k in model_state_dict
    }
    model_state_dict.update(pre_state_dict)
    model.load_state_dict(model_state_dict)
    if args.cuda:
        model.cuda()

    # load data
    data = BERTCLDCDataReader(args, tokenizer)

    # general info for cldc
    cldc_log = (
        'CLDC lang: {}\n'.format(', '.join(args.cldc_lang)) +
        'Label size: {}\n'.format(data.label_size) + 'Labels: [{}]\n'.format(
            ' '.join([lb for idx, lb in data.idx2label.items()])) +
        'Train percentage: {}\n'.format(args.scale) +
        'Val every: {}\n'.format(args.VAL_EVERY) +
        'Train size: {} [{}]\n'.format(
            data.train_size, ' '.join(
Code Example #14
            loss, logits = outputs[:2]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            acc = batch_accuracy(logits, label_tensor)
            print('epoch:{} | acc:{} | loss:{}'.format(epoch, acc, loss))

    torch.save(model.state_dict(), 'bert_cla.ckpt')
    print('Saved the trained model...')

    # Testing

    print('Loading the trained model...')
    model.load_state_dict(torch.load('bert_cla.ckpt'))

    print('Starting testing...')
    model.eval()
    test_result = []
    for item in test_dataset:

        text_list = list(json.loads(item[1]))
        text_tensor = torch.tensor(text_list).unsqueeze(0).to(device)

        with torch.no_grad():

            print('list', text_list)
            print('tensor', text_tensor)
            print('tensor.shape', text_tensor.shape)
            outputs = model(text_tensor, labels=None)
Code Example #15
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                  dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                  dtype=torch.long)
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
test_dataset = TensorDataset(all_input_ids, all_attention_mask,
                             all_token_type_ids, all_label)

bert_config = BertConfig.from_pretrained(
    './input/config/bert-base-chinese-config.json')
bert_config.num_labels = len(processor.get_labels())

model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load('./output/best_sim.bin', map_location=device))
model = model.to(device)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset,
                             sampler=test_sampler,
                             batch_size=256)

total_loss = 0.  # running sum of the loss
total_sample_num = 0  # total number of samples
all_real_label = []  # all ground-truth labels
all_pred_label = []  # all predicted labels

for batch in tqdm(test_dataloader, desc="testing"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
Code Example #16
    config = BertConfig.from_pretrained('bert-base-chinese',
                                        resume_download=True)
    config.num_labels = 2
    model = BertForSequenceClassification.from_pretrained('bert-base-chinese',
                                                          config=config)

    optimizer = AdamW(model.parameters(), lr=LR, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=WARMUP_STEPS,
                                     t_total=T_TOTAL)

    # optimizer = optim.Adam(model.parameters(), lr=LR)

    print('Loading the trained model...')
    model.load_state_dict(torch.load('90.9847368421052632_bert_cla.ckpt'))
    print('Starting testing...')
    model.eval()
    model = model.cuda()
    test_result = []
    num = 0
    for item in test_dataset:
        text_list = list(json.loads(item[1]))
        text_tensor = torch.tensor(text_list).unsqueeze(0).cuda()

        with torch.no_grad():
            # print('list', text_list)
            # print('tensor', text_tensor)
            # print('tensor.shape', text_tensor.shape)
            outputs = model(text_tensor, labels=None)
            num += 1
Code Example #17
if args.use_gpu:
    pin_mem = True
else:
    pin_mem = False

# Generates a dataloader on the dataset that outputs entire set as a batch for one time predictions
raw_loader = torch.utils.data.DataLoader(raw_data,
                                         batch_size=args.data_batch_size,
                                         collate_fn=collate_fn,
                                         pin_memory=pin_mem)

abs_model_path = Path(args.model_path)
config_file = abs_model_path.parent / "config.json"
config = BertConfig.from_json_file(config_file)
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load(abs_model_path))
model.to(device)
model.eval()
torch.set_grad_enabled(False)  # globally disable gradient tracking for inference (a bare torch.no_grad() call has no effect)
print("Model Loaded")
print(model)
print("-------------------")

data_logit_list = []
for batch in tqdm(raw_loader):
    current_logits = eval_util.calculate_batched_predictions(
        batch, model, device, args.target_publication)
    data_logit_list = data_logit_list + list(current_logits)
converted_list = np.array(data_logit_list)
sorted_preds = np.sort(converted_list)
indices = np.argsort(converted_list)
Code Example #18
        'pos_width': f"{pos_prob * 100}%",
        'neg_width': f"{neg_prob * 100}%",
    }


# %% set hyperparameter
args = ClassificationDeployArguments(
    pretrained_model_name="beomi/kcbert-base",
    downstream_model_dir="nlpbook/checkpoint-doccls",
    max_seq_length=128,
)

# %% load model
fine_tuned_model_ckpt = torch.load(args.downstream_model_checkpoint_fpath,
                                   map_location=torch.device("cuda"))

pt_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=fine_tuned_model_ckpt['state_dict']
    ['model_classifier.bias'].shape.numel(),
)

model = BertForSequenceClassification(pt_model_config)
model.load_state_dict({
    k.replace("model.", ""): v
    for k, v in fine_tuned_model_ckpt['state_dict'].items()
})
model.eval()

tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name,
                                          do_lower_case=False)
Code Example #19
def get_sim_model(config_file, pre_train_model, label_num=2):
    bert_config = BertConfig.from_pretrained(config_file)
    bert_config.num_labels = label_num
    model = BertForSequenceClassification(bert_config)
    model.load_state_dict(torch.load(pre_train_model))
    return model
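A minimal usage sketch for the helper above (the file paths are placeholders):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = get_sim_model('./input/config/bert-base-chinese-config.json',
                      './output/best_sim.bin', label_num=2)
model.to(device)
model.eval()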
Code Example #20
class bert_classifier(object):
    def __init__(self):
        self.config = Config()
        self.device_setup()
        self.model_setup()

    def device_setup(self):
        """
        设备配置并加载BERT模型
        :return:
        """

        # 使用GPU,通过model.to(device)的方式使用
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        model_save_path = self.config.get("result", "model_save_path")
        config_save_path = self.config.get("result", "config_save_path")
        vocab_save_path = self.config.get("result", "vocab_save_path")

        self.model_config = BertConfig.from_json_file(config_save_path)
        self.model = BertForSequenceClassification(self.model_config)
        self.state_dict = torch.load(model_save_path)
        self.model.load_state_dict(self.state_dict)
        self.tokenizer = transformers.BertTokenizer(vocab_save_path)
        self.model.to(self.device)
        self.model.eval()

    def model_setup(self):
        weight_decay = self.config.get("training_rule", "weight_decay")
        learning_rate = self.config.get("training_rule", "learning_rate")

        # Define the optimizer and loss function
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()

    def predict(self, sentence):
        input_ids, token_type_ids = convert_text_to_ids(
            self.tokenizer, sentence)
        input_ids = seq_padding(self.tokenizer, [input_ids])
        token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
        # LongTensor is required
        input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
        # zero out the gradients
        self.optimizer.zero_grad()
        # move tensors to the GPU
        input_ids, token_type_ids = input_ids.to(
            self.device), token_type_ids.to(self.device)
        output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        y_pred_prob = output[0]
        y_pred_label = y_pred_prob.argmax(dim=1)
        print(y_pred_label)
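A minimal usage sketch for the class above (the input sentence is an arbitrary example):

classifier = bert_classifier()
classifier.predict('An example sentence to classify.')  # prints the predicted label index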
Code Example #21
            'params': [p for n, p in param_optimizer if
                       any(nd in n for nd in no_decay)], 'weight_decay': 0.0
        }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)

    if args.mode == 'test':
        test = FeverDataset(args.test_dataset)
        # print(Counter([_x['label'] for _x in test]).most_common(3))
        test_dl = BucketBatchSampler(batch_size=args.batch_size,
                                     sort_key=sort_key, dataset=test,
                                     collate_fn=collate_fn)

        checkpoint = torch.load(args.model_path)

        model.load_state_dict(checkpoint['model'])
        print(eval_model(model, test_dl))

    else:
        print("Loading datasets...")
        train = FeverDataset(args.train_dataset)
        dev = FeverDataset(args.dev_dataset)

        # print(Counter([_x['label'] for _x in train]).most_common(3))
        # print(Counter([_x['label'] for _x in dev]).most_common(3))

        train_dl = BucketBatchSampler(batch_size=args.batch_size,
                                      sort_key=sort_key,
                                      dataset=train,
                                      collate_fn=collate_fn)
        dev_dl = BucketBatchSampler(batch_size=args.batch_size,
class BERT():
  def __init__(self, model_path=None, config=None):
    #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = torch.device("cpu")

    # load tokenizer
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # load model configuration
    if config is None:
        config = BertConfig()

    # path to save model file
    if model_path is None:
      base_dir = os.path.dirname(os.path.realpath(__file__))
      model_dir = os.path.join(base_dir, '.models')

      os.makedirs(model_dir, exist_ok=True)

      url = "https://www.dropbox.com/s/jw18aln9rmg69d6/BERT_Weights.pt?dl=0"

      model_name = os.path.split(url)[-1][:-5]
      model_path = os.path.join(model_dir, model_name)

      # download model
      if not os.path.exists(model_path):
        subprocess.call(['wget', url, '-O', model_path])

    # load pre-trained model
    self.model = BertForSequenceClassification(config)
    self.model.load_state_dict(torch.load(model_path,  map_location=self.device))

  def preprocess(self, text):
    # encode text
    input_encoded = self.tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    # setup BERT parameters
    input_ids = input_encoded["input_ids"]
    attention_mask = input_encoded["attention_mask"]

    # prepare dataset
    pred_data = TensorDataset(input_ids, attention_mask)
    # sample dataset
    pred_sampler = SequentialSampler(pred_data)
    # prepare dataloader
    pred_dl = DataLoader(pred_data, sampler = pred_sampler, batch_size = 1)

    return pred_dl

  @torch.no_grad()
  def predict(self, pred_dl):
    for s, b in enumerate(pred_dl):
      # get batch
      b = tuple(t.to(self.device) for t in b)

      # get BERT parameters
      input_idsx, attention_maskx = b

      # predict
      outs = self.model(
          input_ids = input_idsx,
          attention_mask = attention_maskx,
          token_type_ids = None,
          )

      # predictions
      logits = outs[0]
      logits = logits.detach().cpu().numpy()
      logits = np.argmax(logits, axis=-1).item()

    return logits
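A minimal usage sketch for the wrapper above (the input text is an arbitrary example; the first run may download the model weights):

bert = BERT()
pred_dl = bert.preprocess('An example sentence to classify.')
label = bert.predict(pred_dl)  # integer class index
print(label)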