Example #1
0
def mymodel_test(logger, test_dataloader, the_time=my_time):
    """Evaluate a saved emotion-classification checkpoint on a test set.

    Loads Electra embedding weights from the ``embedding/`` subdirectory of
    ``args.mymodel_save_dir`` and the classifier head (both prefixed with
    *the_time*), runs one no-grad pass over *test_dataloader* and logs the
    mean accuracy.

    Args:
        logger: logger used to report the accuracy (or a batch-size error).
        test_dataloader: iterable of (input_ids, labels) tensor batches.
        the_time: checkpoint filename prefix; defaults to the module-level
            ``my_time`` captured at import time.

    Returns:
        list: single-element list with the mean accuracy, or an empty list
        when no batch was processed.
    """
    config = ElectraConfig.from_pretrained(args.mymodel_config_dir)
    embedding = ElectraModel(config=config)
    model = EmtClassifyModel(config=config, args=args)
    # Embedding weights are saved under an 'embedding/' subdirectory.
    output_model_file = os.path.join(args.mymodel_save_dir, 'embedding/')
    model_state_dict = torch.load(
        os.path.join(output_model_file, the_time + 'pytorch_model.bin'))
    embedding.load_state_dict(model_state_dict)
    model.load(os.path.join(args.mymodel_save_dir, the_time + "mymodel.bin"))
    if args.fp16:
        embedding.half()
        model.half()
    embedding.to(device)
    model.to(device)
    embedding.eval()
    model.eval()
    acc_records = []
    eval_accuracy = 0  # NOTE: removed unused eval_loss accumulator
    nb_eval_steps = 0
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        # Drop the extra sequence dim from the dataloader; ids must be long.
        b_input_ids = b_input_ids.squeeze(1).long()
        with torch.no_grad():
            text_embedding = embedding(input_ids=b_input_ids)
            tmp_eval_accuracy = model.test(text_embedding, b_labels)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    try:
        # ZeroDivisionError here means the dataloader yielded no batches.
        logger.info('准确率为:{:.2f}%'.format(100 * eval_accuracy / nb_eval_steps))
        acc_records.append(eval_accuracy / nb_eval_steps)
    except ZeroDivisionError:
        logger.info("错误!请降低batch大小")
    return acc_records
Example #2
0
def mymodel_pred(logger, text, the_time=my_time):
    """Predict the emotion label for a single piece of raw text.

    Loads the Electra embedding and classifier checkpoints (prefixed with
    *the_time*) from ``args.mymodel_save_dir``, tokenizes *text*, and prints
    the predicted emotion label.

    Args:
        logger: kept for signature compatibility; not used directly here.
        text: raw input string to classify.
        the_time: checkpoint filename prefix; defaults to the module-level
            ``my_time`` captured at import time.

    Returns:
        The predicted emotion label (previously this function returned
        ``None``; callers that ignore the return value are unaffected).
    """
    config = ElectraConfig.from_pretrained(args.mymodel_config_dir)
    embedding = ElectraModel(config=config)
    model = EmtClassifyModel(config=config, args=args)
    # Embedding weights are saved under an 'embedding/' subdirectory.
    output_model_file = os.path.join(args.mymodel_save_dir, 'embedding/')
    model_state_dict = torch.load(
        os.path.join(output_model_file, the_time + 'pytorch_model.bin'))
    embedding.load_state_dict(model_state_dict)
    output_model_file = os.path.join(args.mymodel_save_dir,
                                     the_time + "mymodel.bin")
    model_state_dict = torch.load(output_model_file)
    model.load_state_dict(model_state_dict)
    if args.fp16:
        embedding.half()
        model.half()
    embedding.to(device)
    model.to(device)
    embedding.eval()
    model.eval()
    tokenizer = ElectraTokenizer.from_pretrained(args.vocab_dir)
    input_ids, _, _ = text2ids(tokenizer, text, args.max_sent_len)
    input_ids = torch.Tensor(input_ids).to(device=device)
    # Drop the extra sequence dim; embedding expects long token ids.
    input_ids = input_ids.squeeze(1).long()
    with torch.no_grad():
        text_embedding = embedding(input_ids=input_ids)
        pred = model.get_guess(text_embedding)
    # Map the argmax class index back to its human-readable emotion label.
    label = args.label2emt[label_from_output(pred[0].to('cpu')).item()]
    print(label)
    return label
Example #3
0
def mymodel_cal(logger, test_dataloader, the_time=my_time):
    """Build a confusion matrix for the relation-classification model.

    Loads the Electra embedding and relation-classifier checkpoints (prefixed
    with *the_time*) from ``args.mymodel_save_dir``, runs one no-grad pass
    over *test_dataloader*, and accumulates a (true label, predicted label)
    count matrix.

    Args:
        logger: kept for signature compatibility; not used directly here.
        test_dataloader: iterable of (input_ids1, input_ids2, labels) batches.
        the_time: checkpoint filename prefix; defaults to the module-level
            ``my_time`` captured at import time.

    Returns:
        numpy.ndarray: square confusion matrix of shape
        (len(args.rel2label), len(args.rel2label)); rows are true labels,
        columns are predictions.
    """
    config = ElectraConfig.from_pretrained(args.mymodel_config_dir)
    embedding = ElectraModel(config=config)
    model = RelClassifyModel(config=config, args=args)
    # Embedding weights are saved under an 'embedding/' subdirectory.
    output_model_file = os.path.join(args.mymodel_save_dir, 'embedding/')
    model_state_dict = torch.load(
        os.path.join(output_model_file, the_time + 'pytorch_model.bin'))
    embedding.load_state_dict(model_state_dict)
    output_model_file = os.path.join(args.mymodel_save_dir,
                                     the_time + "mymodel.bin")
    model_state_dict = torch.load(output_model_file)
    model.load_state_dict(model_state_dict)
    if args.fp16:
        embedding.half()
        model.half()
    embedding.to(device)
    model.to(device)
    embedding.eval()
    model.eval()
    target_size = len(args.rel2label)
    result = np.zeros([target_size, target_size])
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids1, b_input_ids2, b_labels = batch
        b_input_ids1 = b_input_ids1.squeeze(1).long()
        b_input_ids2 = b_input_ids2.squeeze(1).long()
        with torch.no_grad():
            text_embedding1 = embedding(input_ids=b_input_ids1)
            # The second sentence is marked with token_type_ids == 1 so the
            # model can distinguish the two segments of the pair.
            text_embedding2 = embedding(input_ids=b_input_ids2,
                                        token_type_ids=torch.ones(
                                            b_input_ids2.size(),
                                            dtype=torch.long,
                                            device=device))
            pred = model.get_guess(text_embedding1, text_embedding2)
        size = pred.size()[0]
        for i in range(size):
            try:
                result[b_labels[i], label_from_output(pred[i])] += 1
            except IndexError:
                # Skip out-of-range labels/predictions instead of a bare
                # except:, which also swallowed KeyboardInterrupt etc.
                continue
    print(result)
    return result
Example #4
0
def mymodel_train(args, logger, train_dataloader, validation_dataloader):
    """Train the NER model (Electra embedding + NerModel head).

    Resumes from checkpoints in ``args.mymodel_save_dir`` when present,
    otherwise falls back to the pretrained weights in
    ``args.pretrained_model_dir``. Two Ranger optimizers are used: one for
    the embedding (separate LRs for the 'embeddings' parameters vs. the
    rest) and one for the head (separate LRs for 'encoder' parameters vs.
    the rest). Checkpoints are saved after every epoch.

    Args:
        args: namespace of hyperparameters and paths (lrs, fp16, dirs, ...).
        logger: logger for per-epoch loss/accuracy reporting.
        train_dataloader: yields (input_ids, attention_mask, labels) batches.
        validation_dataloader: accepted for signature compatibility; not
            used in this function.

    Returns:
        tuple: (embedding, model) — the trained modules.
    """
    config = ElectraConfig.from_pretrained(args.mymodel_config_dir)
    embedding = ElectraModel(config=config)
    model = NerModel(config=config)
    try:
        # Resume the embedding from a previous run if a checkpoint exists.
        output_model_file = os.path.join(args.mymodel_save_dir, 'embedding/')
        model_state_dict = torch.load(
            os.path.join(output_model_file, 'pytorch_model.bin'))
        embedding.load_state_dict(model_state_dict)
    except OSError:
        # No checkpoint: start from the published pretrained weights.
        embedding.from_pretrained(os.path.join(args.pretrained_model_dir,
                                               'pytorch_model.bin'),
                                  config=config)
        print("PretrainedEmbeddingNotFound")
    try:
        output_model_file = os.path.join(args.mymodel_save_dir, "mymodel.bin")
        model_state_dict = torch.load(output_model_file)
        model.load_state_dict(model_state_dict)
    except OSError:
        # Head checkpoint missing: train the head from scratch.
        print("PretrainedMyModelNotFound")
    if args.fp16:
        embedding.half()
        model.half()
    embedding.to(device)
    model.to(device)
    param_optimizer1 = list(embedding.named_parameters())
    param_optimizer2 = list(model.named_parameters())
    # Embedding: token-embedding params get their own lr + weight decay;
    # the rest of the encoder uses args.encoder_lr.
    optimizer_grouped_parameters1 = [
        {
            'params': [
                p for n, p in param_optimizer1
                if any(nd in n for nd in ['embeddings'])
            ],
            'weight_decay_rate':
            args.weight_decay,
            'lr':
            args.embeddings_lr
        },
        {
            'params': [
                p for n, p in param_optimizer1
                if not any(nd in n for nd in ['embeddings'])
            ],
            'lr':
            args.encoder_lr
        },
    ]
    # Head: its 'encoder' params share the embedding's encoder lr; the
    # remaining (task-specific) params use the base learning rate.
    optimizer_grouped_parameters2 = [
        {
            'params': [
                p for n, p in param_optimizer2
                if any(nd in n for nd in ['encoder'])
            ],
            'lr':
            args.encoder_lr
        },
        {
            'params': [
                p for n, p in param_optimizer2
                if not any(nd in n for nd in ['encoder'])
            ],
            'lr':
            args.learning_rate
        },
    ]
    optimizer1 = Ranger(optimizer_grouped_parameters1)
    optimizer2 = Ranger(optimizer_grouped_parameters2)
    epochs = args.train_epochs
    bio_records = []
    train_loss_set = []
    acc_records = []
    embedding.train()
    model.train()
    for _ in trange(epochs, desc='Epochs'):
        tr_loss = 0
        eval_accuracy = 0  # NOTE: removed unused eval_loss/tmp_loss locals
        nb_tr_steps = 0
        nb_eval_steps = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            text_embedding = embedding(input_ids=b_input_ids.squeeze(1).long(),
                                       attention_mask=b_input_mask)
            loss, tmp_eval_accuracy = model(text_embedding, b_labels,
                                            b_input_mask.squeeze(1))
            loss.backward()
            optimizer1.step()
            optimizer2.step()
            # Free cached activations between steps to keep GPU memory low.
            torch.cuda.empty_cache()
            tr_loss += loss.item()
            nb_tr_steps += 1
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        # Per-epoch lr decay on both optimizers.
        adjust_learning_rate(optimizer1, 0.9)
        adjust_learning_rate(optimizer2, 0.9)
        try:
            # ZeroDivisionError here means the dataloader yielded no batches.
            train_loss_set.append(tr_loss / nb_tr_steps)
            logger.info('mymodel训练损失:{:.2f},准确率为:{:.2f}%'.format(
                tr_loss / nb_tr_steps, 100 * eval_accuracy / nb_eval_steps))
            acc_records.append(eval_accuracy / nb_eval_steps)
            bio_records.append(np.mean(train_loss_set))
        except ZeroDivisionError:
            logger.info("错误!请降低batch大小")
        # Unwrap DataParallel (if any) before saving, checkpoint every epoch.
        embedding_to_save = embedding.module if hasattr(
            embedding, 'module') else embedding
        torch.save(
            embedding_to_save.state_dict(),
            os.path.join(os.path.join(args.mymodel_save_dir, 'embedding/'),
                         'pytorch_model.bin'))
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.mymodel_save_dir, "mymodel.bin"))
    return embedding, model
    text_classify_embedding.load_state_dict(model_state_dict)
    output_model_file = os.path.join(args.text_classify_model_save_dir, "mymodel.bin")
    model_state_dict = torch.load(output_model_file)
    text_classify_model.load_state_dict(model_state_dict)

    sent_ner_config = ElectraConfig.from_pretrained(args.sent_ner_model_config_dir)
    sent_ner_embedding = ElectraModel(config=sent_ner_config)
    sent_ner_model = NerModel(config=sent_ner_config)
    output_model_file = os.path.join(args.sent_ner_model_save_dir, 'embedding/')
    model_state_dict = torch.load(os.path.join(output_model_file, 'pytorch_model.bin'))
    sent_ner_embedding.load_state_dict(model_state_dict)
    output_model_file = os.path.join(args.sent_ner_model_save_dir, "mymodel.bin")
    model_state_dict = torch.load(output_model_file)
    sent_ner_model.load_state_dict(model_state_dict)

    text_classify_embedding.to(device)
    text_classify_model.to(device)
    sent_ner_embedding.to(device)
    sent_ner_model.to(device)

    text_classify_embedding.eval()
    text_classify_model.eval()
    sent_ner_embedding.eval()
    sent_ner_model.eval()

    text_classify_model = MyTextClassifyModel(text_classify_model)
    sent_ner_model = MyNerModel(sent_ner_model)
    text_classify = MyTextClassify(text_classify_embedding, text_classify_model, tokenizer)
    sent_ner = MySentNer(sent_ner_embedding, sent_ner_model, tokenizer)

    # 功能选择