Example no. 1
def run():
    """train the model"""
    # set the logger
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(config.device))
    # process the data: separate the text and the labels
    processor = Processor(config)
    processor.process()
    logging.info("--------Process Done!--------")
    # split out the dev set
    word_train, word_dev, label_train, label_dev = load_dev('train')
    # build dataset
    train_dataset = NERDataset(word_train, label_train, config)
    dev_dataset = NERDataset(word_dev, label_dev, config)
    logging.info("--------Dataset Build!--------")
    # get dataset size
    train_size = len(train_dataset)
    # build data_loader
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    logging.info("--------Get Dataloader!--------")
    # Prepare model
    device = config.device
    model = BertNER.from_pretrained(config.roberta_model, num_labels=len(config.label2id))
    model.to(device)
    # Prepare optimizer
    if config.full_fine_tuning:
        # model.named_parameters(): [bert, classifier, crf]
        bert_optimizer = list(model.bert.named_parameters())
        classifier_optimizer = list(model.classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': config.weight_decay},
            {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': config.weight_decay},
            {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
             'lr': config.learning_rate * 5, 'weight_decay': 0.0},
            {'params': model.crf.parameters(), 'lr': config.learning_rate * 5}
        ]
    # only fine-tune the head classifier
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, correct_bias=False)
    train_steps_per_epoch = train_size // config.batch_size
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
                                                num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)
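The DataLoader above delegates batching to NERDataset.collate_fn, which is not shown in this example. Below is a minimal sketch of what such a collate function usually does for token-level NER, assuming each dataset item is a (token_ids, label_ids) pair of Python lists and that 0 is the padding id; both assumptions, and the function itself, are illustrative rather than taken from the example.

import torch

def collate_fn(batch, pad_id=0, label_pad_id=0):
    """Pad a batch of (token_ids, label_ids) pairs to the longest sequence."""
    max_len = max(len(tokens) for tokens, _ in batch)
    input_ids, label_ids, masks = [], [], []
    for tokens, labels in batch:
        pad_len = max_len - len(tokens)
        input_ids.append(tokens + [pad_id] * pad_len)         # pad token ids
        label_ids.append(labels + [label_pad_id] * pad_len)   # pad label ids
        masks.append([1] * len(tokens) + [0] * pad_len)       # 1 = real token
    return (torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(masks, dtype=torch.long),
            torch.tensor(label_ids, dtype=torch.long))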
Example no. 2
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate the text and the labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split out the dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
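Both this example and the k-fold example below rely on a dev_split helper that is not shown. A minimal sketch, assuming the training data is an .npz file with "words" and "labels" arrays (as in Example no. 3) and that config exposes a dev_split_size ratio (a hypothetical attribute name):

import numpy as np
from sklearn.model_selection import train_test_split

def dev_split(dataset_dir):
    """Split the full training set into train and dev portions."""
    data = np.load(dataset_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    # config.dev_split_size is assumed, e.g. 0.1 for a 90/10 split;
    # return order matches word_train, word_dev, label_train, label_dev
    return train_test_split(words, labels,
                            test_size=config.dev_split_size, random_state=0)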
Example no. 3
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # use the GPU id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: separate the text and the labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # load the full training set; the dev set is split out fold by fold below
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))
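KFold.split yields index arrays rather than data, which is why the loop indexes words and labels with train_index and dev_index. Note also that KFold does not shuffle by default, so the folds follow file order. A small self-contained illustration with toy data (not from the example):

import numpy as np
from sklearn.model_selection import KFold

sentences = np.array(["s1", "s2", "s3", "s4", "s5"], dtype=object)
kf = KFold(n_splits=5)  # add shuffle=True, random_state=... for shuffled folds
for fold, (train_index, dev_index) in enumerate(kf.split(sentences), start=1):
    # each fold holds out one fifth of the data as the dev set
    print(fold, train_index, dev_index)
# first fold prints: 1 [1 2 3 4] [0]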
Example no. 4
(this excerpt begins mid-function; tags already holds the model's decoded label ids)
        print(tags.size())  # torch.Size([1, 1, 512])
        tags = tags.squeeze(0).cpu().numpy().tolist()

    preds = tags[0][1:-1]  # strip the [CLS] and [SEP] positions
    label_entities = get_entities(preds, id2label)
    json_d = {}
    json_d['tag_seq'] = ' '.join([id2label[x] for x in preds])
    json_d['entities'] = label_entities
    print(tokens[1:-1])
    print(json_d['tag_seq'].split(' ')[:input_len])
    print(len(tokens[1:-1]))
    print(len(json_d['tag_seq'].split(' ')[:input_len]))


if __name__ == '__main__':
    processor = Processor()
    label_list = processor.get_labels()
    # map labels to ids
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}

    num_labels = len(label_list)
    # s = '常建良,男,1963年出生,工科学士,高级工程师,北京物资学院客座副教授。'
    s = [
        '1', '9', '6', '6', '年', '出', '生', ',', '汉', '族', ',', '中', '共', '党',
        '员', ',', '本', '科', '学', '历', ',', '工', '程', '师', '、', '美', '国', '项',
        '目', '管', '理', '协', '会', '注', '册', '会', '员', '(', 'P', 'M', 'I', 'M',
        'e', 'm', 'b', 'e', 'r', ')', '、', '注', '册', '项', '目', '管', '理', '专',
        '家', '(', 'P', 'M', 'P', ')', '、', '项', '目', '经', '理', '。'
    ]
    tokenizer = BertTokenizer.from_pretrained('./bert_pretrain/vocab.txt')
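The top of this example, where the model inputs are built and tags is decoded, is missing from the excerpt. A minimal sketch of how the character list s would be turned into BERT inputs with this tokenizer; the actual prediction and CRF decoding that produce tags are not reconstructed here:

import torch

tokens = ['[CLS]'] + s + ['[SEP]']  # add BERT's special tokens
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
input_mask = torch.ones_like(input_ids)  # single sentence, so no padding
print(input_ids.shape)  # torch.Size([1, len(s) + 2])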
Example no. 5
def pharmacy_counting():
    start_time = time.time()
    processor = Processor(input_file, output_file)
    processor.process()
    print("program executed: %s" % (time.time() - start_time))
Example no. 6
def embedding(vocab):
    # assumed opening (the excerpt starts mid-call): the arguments here and the
    # .index_to_key / .get_vector calls below match gensim's KeyedVectors API
    # (from gensim.models import KeyedVectors)
    word2vec_model = KeyedVectors.load_word2vec_format(
        config.embedding_dir, binary=False, encoding='utf-8')
    vocab_size = len(vocab) + 1
    embed_size = config.embedding_size
    weight = torch.zeros(vocab_size, embed_size)
    cnt = 0
    for word in word2vec_model.index_to_key:
        try:
            index = vocab.word_id(word)
        except Exception:
            # word is not in the task vocabulary; skip it
            continue
        cnt += 1
        # copy the pretrained vector into the row for this word's id
        weight[index, :] = torch.from_numpy(word2vec_model.get_vector(word))
    logging.info("--------Pretrained Embedding Loaded ! ({}/{})--------".format(cnt, len(vocab)))
    return weight


if __name__ == "__main__":
    from data_process import Processor
    from Vocabulary import Vocabulary
    processor = Processor(config)
    processor.data_process()
    # 建立词表
    vocab = Vocabulary(config)
    vocab.get_vocab()
    matrix, emb_vocab, size, l = load_embedding_manually(config.embedding_dir)
    print(emb_vocab['i2w'][4])  # 大
    print(vocab.word_id(emb_vocab['i2w'][4]))  # 15
    w = embedding(vocab)
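The weight matrix returned by embedding() is typically handed to an embedding layer. A short usage sketch with PyTorch; setting freeze=False (so the vectors keep training) is a choice assumed here, not part of the example:

import torch.nn as nn

weight = embedding(vocab)
# from_pretrained copies the matrix into the layer; freeze=False allows fine-tuning
embed_layer = nn.Embedding.from_pretrained(weight, freeze=False)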