Example #1
def decode(hp):
    tokenizer = BertTokenizer.from_pretrained(hp.bert_model_dir, do_lower_case=False)
    if hp.dataset == "lm_raw_data_finance":
        dict_file = "../data/dataset_finance/annual_report_entity_list"
    elif hp.dataset == "lm_raw_data_novel":
        dict_file = "../data/dataset_book9/entity_book9"
    entity_dict = EntityDict(hp.dataset, dict_file)
    entity_dict.load(os.path.join(hp.bert_model_dir, 'entity.dict'))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    ner_label = NerLabel([hp.decodeset])
    if os.path.exists(os.path.join(hp.logdir, 'dict.pt')):
        ner_label.load(os.path.join(hp.logdir, 'dict.pt'))
    else:
        print('dict.pt does not exist')
        exit()

    decode_dataset = NerDataset(hp.decodeset, ner_label, tokenizer, entity_dict)

    model = Net(hp.bert_model_dir, hp.top_rnns, len(ner_label.VOCAB), entity_dict.entity_num, device, hp.finetuning).to(device)
    model = nn.DataParallel(model)
    ## Load the model parameters
    if os.path.exists(os.path.join(hp.logdir, 'model.pt')):
        model.load_state_dict(torch.load(os.path.join(hp.logdir, 'model.pt')))
    else:
        print("the pretrianed model path does not exist! ")
        exit()

    decode_iter = data.DataLoader(dataset=decode_dataset,
                                  batch_size=hp.batch_size,
                                  shuffle=True,
                                  num_workers=4,
                                  collate_fn=pad)

    fname = os.path.join(hp.logdir, '_')

    precision, recall, f1 = evaluate(model, decode_iter, fname, ner_label, verbose=False)
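
Every snippet on this page passes a pad function as collate_fn, but the helper itself is never shown. A minimal sketch of such a collate function, assuming each dataset item is a (words, token_ids, is_heads, tags, label_ids, seqlen) tuple as in common BERT-NER codebases (the field layout is an assumption, not from the original):

import torch

def pad(batch):
    # Hypothetical collate_fn: pad every variable-length field to the longest
    # sequence in the batch. Assumes items are (words, x, is_heads, tags, y, seqlen).
    get = lambda i: [sample[i] for sample in batch]
    words, is_heads, tags, seqlens = get(0), get(2), get(3), get(5)
    maxlen = max(seqlens)
    pad_to = lambda i: [sample[i] + [0] * (maxlen - len(sample[i])) for sample in batch]
    x = torch.LongTensor(pad_to(1))   # padded token ids; 0 is the <PAD> index
    y = torch.LongTensor(pad_to(4))   # padded label ids, matching ignore_index=0
    return words, x, is_heads, tags, y, seqlens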
Example #2
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--lr", type=float, default=0.0001)
    parser.add_argument("--n_epochs", type=int, default=30)
    parser.add_argument("--finetuning", dest="finetuning", action="store_true")
    parser.add_argument("--explainable", dest="explainable", action="store_true")
    parser.add_argument("--top_rnns", dest="top_rnns", action="store_true")
    parser.add_argument("--logdir", type=str, default="checkpoints/01")
    parser.add_argument("--trainset", type=str, default="conll2003/train.txt")
    parser.add_argument("--validset", type=str, default="conll2003/valid.txt")
    hp = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_dataset = NerDataset(hp.trainset)
    eval_dataset = NerDataset(hp.validset)

    if hp.explainable:
        print('WARNING: using EXPLAINABLE mode')
        maxlen = max(train_dataset.maxlen, eval_dataset.maxlen)
        _pad = partial(pad, maxlen=maxlen)

        model = Net(top_rnns=hp.top_rnns, vocab_size=len(VOCAB), device=device, finetuning=hp.finetuning,
                    explainable=hp.explainable).to(device)
        model.init_tf(maxlen=maxlen)
        #model = nn.DataParallel(model)
    else:
        model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).to(device)
        model = nn.DataParallel(model)
        _pad = pad
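
The explainable branch uses functools.partial to freeze maxlen, so every batch is padded to the same global width rather than to its own longest sequence. A tiny self-contained illustration of the pattern (pad_ids is a hypothetical stand-in for the real pad):

from functools import partial

def pad_ids(batch, maxlen=None):
    # hypothetical stand-in: pad lists of token ids to maxlen, or to the batch max
    maxlen = maxlen or max(len(ids) for ids in batch)
    return [ids + [0] * (maxlen - len(ids)) for ids in batch]

_pad = partial(pad_ids, maxlen=16)  # every batch now comes out with width 16
print(_pad([[1, 2, 3], [4, 5]]))    # two rows, each padded to length 16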
Example #3
def train(hp):
    tokenizer = BertTokenizer.from_pretrained(hp.bert_model_dir, do_lower_case=False)
    if hp.dataset == "lm_raw_data_finance":
        dict_file = "../data/dataset_finance/raw_data/annual_report_entity_list"
    elif hp.dataset == "lm_raw_data_novel":
        dict_file = "../data/dataset_book9/raw_data/entity_book9"
    elif hp.dataset == "lm_raw_data_thuner":
        dict_file = "../data/dataset_thuner/raw_data/thu_entity.txt"
    entity_dict = EntityDict(hp.dataset, dict_file)
    entity_dict.load(os.path.join(hp.bert_model_dir, 'entity.dict'))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    ner_label = NerLabel([hp.trainset, hp.validset])
    fname = os.path.join(hp.logdir, 'dict.pt')
    ner_label.save(fname)

    train_dataset = NerDataset(hp.trainset, ner_label, tokenizer, entity_dict)
    eval_dataset = NerDataset(hp.validset, ner_label, tokenizer, entity_dict)
    test_dataset = NerDataset(hp.testset, ner_label, tokenizer, entity_dict)

    model = Net(hp.bert_model_dir, hp.top_rnns, len(ner_label.VOCAB), entity_dict.entity_num, device, hp.finetuning).to(device)
    device_ids = [0, 1]
    model = nn.DataParallel(model, device_ids=device_ids)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=pad)
    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)
    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr=hp.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    ## train the model
    best_eval = -10
    for epoch in range(1, hp.n_epochs + 1):
        train_epoch(model, train_iter, optimizer, criterion, tokenizer)

        print(f"=========eval at epoch={epoch}=========")
        if not os.path.exists(hp.logdir): os.makedirs(hp.logdir)
        fname = os.path.join(hp.logdir, 'model')
        precision, recall, f1 = evaluate(model, eval_iter, fname, ner_label, verbose=False)

        if f1 > best_eval:
            best_eval = f1
            print("epoch{} get the best eval f-score:{}".format(epoch, best_eval))
            torch.save(model.state_dict(), f"{fname}.pt")
            print(f"weights were saved to {fname}.pt")

        print(f"=========test at epoch={epoch}=========")
        if not os.path.exists(hp.logdir): os.makedirs(hp.logdir)
        fname = os.path.join(hp.logdir, str(epoch))
        precision, recall, f1 = evaluate(model, test_iter, fname, ner_label, verbose=False)
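
train_epoch is called but not defined in this excerpt. A minimal sketch of what it could look like, assuming the batch layout from the pad sketch above and a model whose forward returns per-token logits (both assumptions):

def train_epoch(model, train_iter, optimizer, criterion, tokenizer):
    # hypothetical one-epoch loop; the real signature and batch layout may differ
    model.train()
    for step, batch in enumerate(train_iter):
        words, x, is_heads, tags, y, seqlens = batch
        optimizer.zero_grad()
        logits = model(x)                                 # (B, T, num_labels), assumed
        loss = criterion(logits.reshape(-1, logits.size(-1)),
                         y.reshape(-1).to(logits.device))  # ignore_index=0 skips <PAD>
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print(f"step {step}: loss={loss.item():.4f}")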
Example #4
    hp = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("=========build model=========")
    model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).to(device)
    # model=1
    print("=========parallel=========")
    model = nn.DataParallel(model)

    df_case = pd.read_csv('df_case.csv')
    # make_ner_txt(df_case.inf,hp.testset)

    print("=========load data=========")

    test_dataset = NerDataset(hp.testset)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr=hp.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    # f1_best=0
    # fname = os.path.join(hp.logdir, 'model')

    # print("=========ner=========")
    # test_model=Net(hp.top_rnns, len(VOCAB), device, hp.finetuning)
    # test_model = nn.DataParallel(test_model)
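
The commented-out tail suggests rebuilding the model for inference. A minimal sketch of reloading a checkpoint saved from a DataParallel-wrapped model as in Example #3 (the model.pt path is an assumption):

    test_model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).to(device)
    test_model = nn.DataParallel(test_model)
    state = torch.load(os.path.join(hp.logdir, "model.pt"), map_location=device)
    test_model.load_state_dict(state)  # keys keep the "module." prefix DataParallel adds
    test_model.eval()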
Example #5
        torch.set_rng_state(state['torch_rng_state'])
        torch.cuda.set_rng_state(state['cuda_rng_state'])
        torch.cuda.set_rng_state_all(state['cuda_rng_state_all'])
        np.random.set_state(state['np_rng_state'])
        random.setstate(state['random_rng_state'])
        if state.get('model') is not None and model is not None:
            model.load_state_dict(state['model'])
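
# The restore fragment above pulls RNG state out of a checkpoint dict; the save
# side would capture the same keys. A minimal sketch (assumes a CUDA device,
# matching the restore code):
def capture_rng_state(model=None):
    return {
        'torch_rng_state': torch.get_rng_state(),
        'cuda_rng_state': torch.cuda.get_rng_state(),
        'cuda_rng_state_all': torch.cuda.get_rng_state_all(),
        'np_rng_state': np.random.get_state(),
        'random_rng_state': random.getstate(),
        'model': model.state_dict() if model is not None else None,
    }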


if __name__ == "__main__":
    for fold_iter_idx in range(args.fold_num):
        if args.do_train:
            Fold_iter_idx = fold_iter_idx
            # Prepare Documents
            if args.task.lower() == 'bc2gm':
                train_dataset = NerDataset(f"../{args.task}/train.tsv",
                                           args.task, args.pretrain_dir)
                eval_dataset = NerDataset(f"../{args.task}/test.tsv",
                                          args.task, args.pretrain_dir)
            elif args.task.lower() == 'bc6pm':
                if args.do_cross_valid:
                    if fold_iter_idx == 0:
                        if args.train_withGNP:
                            train_data_dir = os.path.join(
                                os.environ.get('BC6PM_dir'), 'GNormPlus',
                                'withAnn-Result',
                                'PMtask_Relations_TrainingSet_r.json')
                        else:
                            train_data_dir = os.path.join(
                                os.environ.get('BC6PM_dir'), 'json',
                                'PMtask_Relations_TrainingSet.json')
                        with open(train_data_dir) as f:
Example #6
        fout.write(f"precision={score[0]}\n")
        fout.write(f"recall={score[1]}\n")
        fout.write(f"f1={score[2]}\n")

    os.remove(f)

    print("precision=%.2f" % score[0])
    print("recall=%.2f" % score[1])
    print("f1=%.2f" % score[2])
    return score[0], score[1], score[2]


if __name__ == "__main__":

    train_dataset = NerDataset("Data/train.tsv", 'i2b2')
    eval_dataset = NerDataset("Data/test.tsv", 'i2b2')

    # Define model
    config = BertConfig(
        vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)
    model = Net(config=config,
                bert_state_dict=model_state_dict,
                vocab_len=len(hp.VOCAB),
                device=hp.device)

    # 'bc5cdr': ('<PAD>', 'B-Chemical', 'O', 'B-Disease' , 'I-Disease', 'I-Chemical'),

    class_sample_count = [
        10, 1, 20, 3, 4
    ]  # dataset has 10 class-1 samples, 1 class-2 sample, etc.
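
The class_sample_count list is the usual input to a torch.utils.data.WeightedRandomSampler. A plausible continuation (assumed, not part of the original) weights every sample inversely to its class frequency:

from torch.utils.data import WeightedRandomSampler

weights_per_class = 1.0 / torch.tensor(class_sample_count, dtype=torch.float)
sample_labels = torch.tensor([0, 2, 1, 4, 3])  # hypothetical per-example class ids
sample_weights = weights_per_class[sample_labels]
sampler = WeightedRandomSampler(sample_weights,
                                num_samples=len(sample_weights),
                                replacement=True)
# pass sampler=sampler (and drop shuffle=True) when building the DataLoader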
Example #7
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove(f)

    print("precision=%.2f" % precision)
    print("recall=%.2f" % recall)
    print("f1=%.2f" % f1)
    return precision, recall, f1


if __name__ == "__main__":

    train_dataset = NerDataset("data/train.tsv",
                               'bc5cdr')  # 'bc5cdr' is the dataset type
    eval_dataset = NerDataset("data/test.tsv", 'bc5cdr')
    hp = HParams('bc5cdr')

    # Define model
    config = BertConfig(
        vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE)
    model = Net(config=config,
                bert_state_dict=state_dict,
                vocab_len=len(hp.VOCAB),
                device=hp.device)
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    # update with already pretrained weight
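
The closing comment points at loading pretrained BERT weights into the model. One common pattern from this era of code builds state_dict by stripping the "bert." prefix from a full checkpoint; a sketch under that assumption (the checkpoint path is hypothetical):

    tmp = torch.load("pytorch_model.bin", map_location="cpu")  # hypothetical path
    # keep only encoder weights, renaming "bert.X" -> "X" to fit a bare BertModel
    state_dict = {k[len("bert."):]: v for k, v in tmp.items() if k.startswith("bert.")}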
Example #8
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove(f)

    print("precision=%.2f"%precision)
    print("recall=%.2f"%recall)
    print("f1=%.2f"%f1)
    return precision, recall, f1

if __name__=="__main__":
    model = Net(training=False)
    model.to('cuda')

    train_dataset = NerDataset("conll2003/train.txt")
    eval_dataset = NerDataset("conll2003/valid.txt")

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=pad)
    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr=hp.lr)
    # optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
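
The excerpt stops after building the optimizer; a minimal sketch of the loop that typically follows, reusing helper names from the earlier examples (their exact signatures are assumptions):

    for epoch in range(1, hp.n_epochs + 1):
        train_epoch(model, train_iter, optimizer, criterion)  # one training pass
        precision, recall, f1 = evaluate(model, eval_iter, f"ckpt_{epoch}")  # hypothetical args
        print(f"epoch {epoch}: precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")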