Example no. 1
import os
from torch.utils.data import DataLoader, RandomSampler


def get_dataloader(data_path, preprocessor, batch_size):
    # NerDataset is the project's dataset class; it reads a token/tag file
    # and applies the given preprocessor.
    train_dataset = NerDataset(os.path.join(data_path, "train_data.txt"), preprocessor)
    val_dataset = NerDataset(os.path.join(data_path, "val_data.txt"), preprocessor)
    test_dataset = NerDataset(os.path.join(data_path, "test_data.txt"), preprocessor)

    # Shuffle only the training split; val/test keep their file order.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler
    )
    # Note: drop_last=True also discards the final partial batch on val/test.
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True)

    return train_dataloader, val_dataloader, test_dataloader
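
A minimal usage sketch, assuming the three *_data.txt files live under a data/ directory and that preprocessor is whatever tokenizer wrapper NerDataset expects (both are assumptions, not shown in the snippet):

# Hypothetical call; the "data/" path and the preprocessor object are assumptions.
train_dl, val_dl, test_dl = get_dataloader("data/", preprocessor, batch_size=32)
print(len(train_dl), "training batches per epoch")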
Example no. 2
import torch


def run_ner_infer(sent):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    top_rnns = True
    model = Net(top_rnns, len(VOCAB), device, finetuning=True)
    # Load the same checkpoint on either device; map_location is only
    # needed when falling back to CPU.
    if device == 'cpu':
        model.load_state_dict(
            torch.load('models/banner_model.pt',
                       map_location=torch.device('cpu')))
    else:
        model.load_state_dict(torch.load('models/banner_model.pt'))
    model.to(device)

    # Dummy 'O' tags so the dataset has a label for every input token.
    words = sent.split()
    tags = ['O'] * len(words)
    sent_infer = [["[CLS]"] + words + ["[SEP]"]]
    tags_infer = [["<PAD>"] + tags + ["<PAD>"]]

    infer_data = NerDataset(sent_infer, tags_infer)

    infer_iter = torch.utils.data.DataLoader(dataset=infer_data,
                                             batch_size=1,
                                             shuffle=False,
                                             collate_fn=pad,
                                             num_workers=0)
    # `eval` here is the project's evaluation helper, not the builtin.
    pred = eval(model, infer_iter)
    # Map any residual '<PAD>' predictions back to the neutral 'O' tag.
    pred[0] = ['O' if tag == '<PAD>' else tag for tag in pred[0]]
    # Strip the [CLS]/[SEP] positions before returning tokens and tags.
    return sent_infer[0][1:-1], pred[0][1:-1]
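
A minimal usage sketch (the sentence is an arbitrary example; Net, VOCAB, NerDataset, and pad must already be defined by the surrounding project):

tokens, labels = run_ner_infer("Steve Jobs founded Apple in California")
for token, label in zip(tokens, labels):
    print(f"{token}\t{label}")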
Example no. 3
from transformers import BertTokenizer


def main():
    args = parse()
    if args.use_pretrained_embedding:
        model_version = 'embedding/adl-pretrained-model/bert-embedding-epoch-9/'
    else:
        model_version = 'cl-tohoku/bert-base-japanese'

    # setting() is a project helper that builds the model(s) and picks the device.
    model, cus_model, device = setting(model_version, args)

    if args.bert:
        trainset = EE_dataset(mode='train')
        validset = EE_dataset(mode='valid')
        train(trainset, validset, device, model, cus_model, BATCH_SIZE=args.batch_size)

    elif args.bert_bilstm_crf:
        tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese', do_lower_case=True)
        trainset = NerDataset('./BIO_data/BIO_data.pkl', tokenizer, tag2idx)
        validset = NerDataset('./BIO_data/BIO_data_dev.pkl', tokenizer, tag2idx)
        train_with_crf(model, trainset, validset, device, 5)  # 5 is presumably the epoch count
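
parse() is not shown in this snippet; a plausible sketch, assuming plain argparse flags matching the attributes main() reads (all flag names are taken from the code above, the defaults are assumptions):

import argparse

def parse():
    # Hypothetical reconstruction: only the attributes referenced in main().
    p = argparse.ArgumentParser()
    p.add_argument('--use_pretrained_embedding', action='store_true')
    p.add_argument('--bert', action='store_true')
    p.add_argument('--bert_bilstm_crf', action='store_true')
    p.add_argument('--batch_size', type=int, default=8)
    return p.parse_args()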
Example no. 4
# Wrap every split with BERT's special tokens: [CLS]/[SEP] around the
# tokens, and <PAD> placeholders at the matching label positions.
sents_train, tags_li_train = [], []
for x in train_texts:
    sents_train.append(["[CLS]"] + x + ["[SEP]"])
for y in train_labels:
    tags_li_train.append(["<PAD>"] + y + ["<PAD>"])

sents_valid, tags_li_valid = [], []
for x in valid_texts:
    sents_valid.append(["[CLS]"] + x + ["[SEP]"])
for y in valid_labels:
    tags_li_valid.append(["<PAD>"] + y + ["<PAD>"])

sents_test, tags_li_test = [], []
for x in test_texts:
    sents_test.append(["[CLS]"] + x + ["[SEP]"])
for y in test_labels:
    tags_li_test.append(["<PAD>"] + y + ["<PAD>"])

train_dataset = NerDataset(sents_train, tags_li_train)
eval_dataset = NerDataset(sents_valid, tags_li_valid)
test_dataset = NerDataset(sents_test, tags_li_test)

train_iter = torch.utils.data.DataLoader(dataset=train_dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         collate_fn=pad,
                                         num_workers=0)
eval_iter = torch.utils.data.DataLoader(dataset=eval_dataset,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        collate_fn=pad,
                                        num_workers=0)
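
The pad collate function is referenced but never shown. A minimal sketch of what such a collate might look like, assuming each NerDataset item yields parallel lists of token ids and tag ids with 0 as the padding id (the project's real pad likely returns extra fields such as head masks and sequence lengths):

import torch

def pad(batch):
    # Hypothetical sketch: right-pad every sequence to the batch maximum,
    # using 0 as both the token and tag padding id.
    maxlen = max(len(tokens) for tokens, _ in batch)
    x = [tokens + [0] * (maxlen - len(tokens)) for tokens, _ in batch]
    y = [tags + [0] * (maxlen - len(tags)) for _, tags in batch]
    return torch.LongTensor(x), torch.LongTensor(y)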
Example no. 5
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove("temp_bert")
    print("precision=%.4f" % precision)
    print("recall=%.4f" % recall)
    print("f1=%.4f" % f1)
    return precision, recall, f1


model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).cuda()
model = nn.DataParallel(model)  # replicate the model across all visible GPUs

train_dataset = NerDataset(hp.trainset)
eval_dataset = NerDataset(hp.validset)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=hp.batch_size,
                             shuffle=True,
                             num_workers=4,
                             collate_fn=pad)
eval_iter = data.DataLoader(dataset=eval_dataset,
                            batch_size=hp.batch_size,
                            shuffle=False,
                            num_workers=4,
                            collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr=hp.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # index 0 (presumably <PAD>) is excluded from the loss
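
A minimal sketch of the training step these objects feed into, assuming each batch from pad yields token-id and tag-id tensors and that Net returns per-token logits (both are assumptions about this project's API):

# Hypothetical training step; Net's forward signature is assumed, not confirmed.
model.train()
for x, y in train_iter:
    x, y = x.cuda(), y.cuda()
    optimizer.zero_grad()
    logits = model(x)  # assumed shape: (batch, seq_len, num_tags)
    loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
    loss.backward()
    optimizer.step()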
Example no. 6
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove(f)  # f is presumably the temporary file written above

    print("precision=%.2f" % precision)
    print("recall=%.2f" % recall)
    print("f1=%.2f" % f1)
    return precision, recall, f1


if __name__ == "__main__":

    train_dataset = NerDataset("../input/train.tsv",
                               'bc5cdr')  # 'bc5cdr' selects the dataset type
    eval_dataset = NerDataset("../input/test.tsv", 'bc5cdr')
    hp = HParams('bc5cdr')

    # Define model
    #config = BertConfig(vocab_size_or_config_json_file=config.BERT_CONFIG_FILE)

    model = Net(config=config.BERT_CONFIG_FILE,
                weight=config.BERT_WEIGHTS,
                vocab_len=len(hp.VOCAB),
                device=hp.device)
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    # update with already pretrained weight
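
The snippet cuts off at the comment above. A plausible continuation, mirroring the optimizer/DataLoader pattern of Example no. 5 (every name below is an assumption carried over from that example, not from this source):

# Hypothetical continuation following the same pattern as Example no. 5.
optimizer = optim.Adam(model.parameters(), lr=hp.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0)

train_iter = data.DataLoader(train_dataset, batch_size=hp.batch_size,
                             shuffle=True, collate_fn=pad, num_workers=4)
eval_iter = data.DataLoader(eval_dataset, batch_size=hp.batch_size,
                            shuffle=False, collate_fn=pad, num_workers=4)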