Code example #1
def make_joinlabel_dataset(path,
                           use_entity_token=False,
                           batch_size=16,
                           shuffle=True,
                           num_workers=0):
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.make_example_fulltext(
            use_entity_token=use_entity_token)
        data.append(final_sample)

    PS = PadSequenceCDRFulltextJoinLabelDataset(
        token_pad_value=tokenizer.pad_token_id)
    dataset = CDRFulltextJoinLabelDataset(data)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=shuffle,
                             num_workers=num_workers,
                             collate_fn=PS,
                             pin_memory=False)
    return data, data_loader
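A minimal usage sketch for the loader above; the PubTator path is the one used by the training code further below, and the snippet is illustrative rather than part of the original script:

# Illustrative call; 'data/cdr/CDR_TrainingSet.PubTator.txt' is the path used later in train().
data, loader = make_joinlabel_dataset('data/cdr/CDR_TrainingSet.PubTator.txt',
                                      use_entity_token=False, batch_size=16, shuffle=True)
print('number of full-text samples:', len(data))
for batch in loader:
    break  # each batch is produced by PadSequenceCDRFulltextJoinLabelDataset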
Code example #2
def make_pretrain_ner_dataset(path,
                              use_entity_token=False,
                              batch_size=16,
                              shuffle=True,
                              num_workers=0):
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.extract_ner_sample(
            use_entity_token=use_entity_token)
        data += final_sample

    PS = PadSequenceNERCDRDataset(token_pad_value=tokenizer.pad_token_id)
    dataset = CDRNERDataset(data)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=shuffle,
                             num_workers=num_workers,
                             collate_fn=PS,
                             pin_memory=False)
    return data, data_loader
Code example #3
def make_cdr_non_global_dataset(path,
                                use_entity_token=False,
                                batch_size=16,
                                shuffle=True,
                                num_workers=0,
                                extract_type='intra'):
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.make_example_non_global(
            use_entity_token=use_entity_token, extract_type=extract_type)
        data += final_sample
    PS = PadSequenceCDRSentenceDataset(token_pad_value=tokenizer.pad_token_id)
    dataset = CDRIntraDataset(data)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=shuffle,
                             num_workers=num_workers,
                             collate_fn=PS,
                             pin_memory=False)
    return data, data_loader
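The PadSequence* collate classes passed as collate_fn above are not included in these listings. The following is a minimal illustrative sketch of what such a collate function typically does (pad variable-length token id sequences to the batch maximum with the tokenizer's pad id); it is not the actual implementation:

import torch

class PadTokenSequences:
    """Illustrative collate_fn: pads token id lists to the longest sequence in the batch."""
    def __init__(self, token_pad_value):
        self.token_pad_value = token_pad_value

    def __call__(self, batch):
        # batch is a list of (token_ids, label) pairs, token_ids being Python lists of ints.
        max_len = max(len(ids) for ids, _ in batch)
        padded = torch.full((len(batch), max_len), self.token_pad_value, dtype=torch.long)
        for i, (ids, _) in enumerate(batch):
            padded[i, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        labels = torch.tensor([label for _, label in batch], dtype=torch.long)
        return padded, labels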
Code example #4
def test_extract_data(path):
    tokenizer = get_tokenizer()
    list_data_intra, list_data_inter, list_data_global = [], [], []
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    list_samples = []
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        data_intra, data_inter, data_global = sample.extract_intra_inter_sentence(
            extract_inter=True)
        list_data_intra += data_intra
        list_data_inter += data_inter
        list_data_global += data_global
    return list_data_intra, list_data_inter, list_data_global
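A hedged usage sketch: test_extract_data reports how one PubTator file splits into intra-sentence, inter-sentence, and document-level (global) samples; the path is the development set used elsewhere in these listings:

# Illustrative check of the split sizes.
intra, inter, global_pairs = test_extract_data('data/cdr/CDR_DevelopmentSet.PubTator.txt')
print('intra:', len(intra), 'inter:', len(inter), 'global:', len(global_pairs))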
Code example #5
def read_tacred_data(data_file, label_dict_file):
    with open(data_file, 'r') as f:
        samples = json.load(f)
    print("len sample: ", len(samples))
    from utils.trainer_utils import get_tokenizer
    tokenizer = get_tokenizer()
    data = []
    for sample in samples:
        a_sample = process_sample(sample, tokenizer)
        data.append(a_sample)
    if os.path.isfile(label_dict_file):
        with open(label_dict_file, 'r') as f:
            label_dict = json.load(f)
    else:
        all_labels = list(set([sample['label'] for sample in data]))
        print("all_labels: ", all_labels)
        label_dict = {all_labels[i]: i for i in range(len(all_labels))}
        with open(label_dict_file, 'w') as f:
            json.dump(label_dict, f)
    return data, label_dict
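A minimal usage sketch with hypothetical TACRED file paths; label_dict maps each relation label string to an integer id and is written to label_dict_file on the first call, then reloaded on later calls:

# Hypothetical file locations; adjust to the actual TACRED layout.
data, label_dict = read_tacred_data('data/tacred/train.json', 'data/tacred/label_dict.json')
first = data[0]
print('label:', first['label'], '-> id:', label_dict[first['label']])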
Code example #6
def train(num_epochs=100):
    best_test_results = None
    train_loader = make_cdr_train_dataset(train_path='data/cdr/CDR_TrainingSet.PubTator.txt',
                                          dev_path='data/cdr/CDR_DevelopmentSet.PubTator.txt')
    test_loader = make_cdr_dataset('data/cdr/CDR_TestSet.PubTator.txt')

    tokenizer = get_tokenizer()
    electra_config = ElectraConfig()
    # net = ElectraModelClassification(electra_config)
    net = ElectraModelClassification.from_pretrained('google/electra-small-discriminator')
    # summary(net)
    # Optionally freeze encoder parameters here, e.g.:
    #     if 'encoder' in name: param.requires_grad = False
    for name, param in net.named_parameters():
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))
    if cuda:
        net.cuda()

    criteria = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in tqdm(enumerate(train_loader)):
            x, masked_entities_encoded_seqs, chemical_code_seqs, disease_code_seqs, label = batch
            # print('label = ', label)
            # label = torch.squeeze(label, 1)
            attention_mask = (x != pad_id).float()
            attention_mask = (1. - attention_mask) * -10000.
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()

            prediction = model(x, token_type_ids=token_type_ids,
                             # attention_masks=attention_mask,
                             used_entity_token=False, masked_entities_list=masked_entities_encoded_seqs,
                             chemical_code_list=chemical_code_seqs, disease_code_list=disease_code_seqs)
            # print('learned before = {}'.format(net.projection.weight.data))
            loss = loss_fn(prediction, label)
            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))

            epoch_loss.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # scheduler.step()

        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()

        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            res = evaluate(model, test_loader, tokenizer)
            return res

    # optimizer = torch.optim.Adam([{"params": net.parameters(), "lr": 0.01}])
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)
    # optimizer = optim.SGD(net.parameters(), lr=0.05)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2,4,6,8,12,15,18,20,22,24,26,30], gamma=0.8)
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = False
        if epoch % 1 == 0 or epoch == num_epochs - 1:
            do_eval = True
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None, tokenizer=tokenizer,
                               do_eval=do_eval)
        if best_test_results is None or res_test['f1-score'] > best_test_results['f1-score']:
            best_test_results = res_test
        print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(best_test_results['precision'],
                                                                                   best_test_results['recall'],
                                                                                   best_test_results['f1-score']))
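The attention-mask construction inside train_model converts a 0/1 padding mask into an additive mask (0 for real tokens, -10000 for padding positions), the form that is added to attention scores before the softmax; a standalone sketch of the same two lines:

import torch

pad_id = 0  # assumption for this illustration; the real value is tokenizer.pad_token_id
x = torch.tensor([[5, 8, 3, pad_id, pad_id]])
attention_mask = (x != pad_id).float()            # 1.0 for real tokens, 0.0 for padding
additive_mask = (1. - attention_mask) * -10000.   # padding positions become -10000., real tokens stay at (negative) zero
print(additive_mask)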
Code example #7
def train_ner(num_epochs=100, use_entity_token=False):
    best_test_results = None
    best_epoch = None
    _, train_loader = make_train_pretrain_ner_dataset(train_path='data/cdr/CDR_TrainingSet.PubTator.txt',
                                                      dev_path='data/cdr/CDR_DevelopmentSet.PubTator.txt',
                                                      use_entity_token=use_entity_token, batch_size=4)
    _, test_loader = make_pretrain_ner_dataset('data/cdr/CDR_TestSet.PubTator.txt', use_entity_token=use_entity_token,
                                               batch_size=4)
    # _, train_loader = make_cdr_non_global_dataset('data/cdr/CDR_TrainingSet.PubTator.txt', use_entity_token=use_entity_token, extract_type='inter')

    tokenizer = get_tokenizer()

    net = ElectraModelEntityTokenClassification.from_pretrained('google/electra-base-discriminator')
    net.resize_token_embeddings(len(tokenizer))
    # summary(net)
    for name, param in net.named_parameters():
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))

    if cuda:
        net.cuda()

    criteria = torch.nn.CrossEntropyLoss().cuda()

    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in enumerate(train_loader):
            x, entity_token_ids, label = batch
            # print('x: ', x)
            attention_mask = (x != pad_id).float()
            attention_mask = (1. - attention_mask) * -10000.
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()

            prediction = model(x, token_type_ids=token_type_ids,
                               # attention_masks=attention_mask,
                               entity_token_ids=entity_token_ids)
            loss = loss_fn(prediction.view(-1, 2), label.view(-1))

            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))

            epoch_loss.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()

        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            res = evaluate_ner(model, test_loader, tokenizer)
            return res

    # optimizer = torch.optim.Adam([{"params": net.parameters(), "lr": 0.01}])
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)

    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = False
        if epoch % 1 == 0 or epoch == num_epochs - 1:
            do_eval = True
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None, tokenizer=tokenizer,
                               do_eval=do_eval)
        if best_test_results is None or res_test['f1-score'] > best_test_results['f1-score']:
            best_test_results = res_test
            best_epoch = epoch
            net.save_pretrained('models_saved/electra_token_model')
        print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(best_test_results['precision'],
                                                                                   best_test_results['recall'],
                                                                                   best_test_results['f1-score']))
        print('Best epoch = ', best_epoch)
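The AdamW parameter grouping used in these training functions separates bias and LayerNorm weights from everything else; in the usual recipe the first group gets a non-zero weight decay while the second is exempt (in the listings here both groups are set to 0.0, so the split has no effect as written). A sketch of the conventional setup, assuming net is the model defined in the surrounding training function:

from torch.optim import AdamW  # the original script's AdamW import is not shown; transformers' AdamW is also common

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {   # ordinary weights: apply weight decay
        "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {   # biases and LayerNorm weights: conventionally exempt from weight decay
        "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)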
Code example #8
def train_sentence(num_epochs=100, use_entity_token=False):
    best_test_results = None
    best_epoch = None
    # _, train_loader = make_cdr_train_non_global_dataset(train_path='data/gda/train.txt',
    #                                                     dev_path='data/cdr/CDR_DevelopmentSet.PubTator.txt',
    #                                                     use_entity_token=use_entity_token, extract_type='intra',
    #                                                     batch_size=8)
    _, train_loader = make_cdr_non_global_dataset('data/gda/train.txt', use_entity_token=use_entity_token,
                                                  extract_type='intra', batch_size=8)
    _, test_loader = make_cdr_non_global_dataset('data/gda/test.txt', use_entity_token=use_entity_token,
                                                 extract_type='intra', batch_size=8)
    # _, train_loader = make_cdr_non_global_dataset('data/cdr/CDR_TrainingSet.PubTator.txt', use_entity_token=use_entity_token, extract_type='inter')

    tokenizer = get_tokenizer()
    # electra_config = ElectraConfig.from_pretrained('google/electra-small-discriminator')
    # electra_config.vocab_size = electra_config.vocab_size + 2
    # net = ElectraModelEntitySentenceClassification(electra_config)

    net_ner_pretrained = ElectraModelEntityTokenClassification.from_pretrained('models_saved/electra_token_model')
    net = ElectraModelEntitySentenceClassification.from_pretrained('google/electra-base-discriminator')
    net.resize_token_embeddings(len(tokenizer))
    # Copy the NER-pretrained encoder weights into the new model's encoder.
    dict_params_ner = dict(net_ner_pretrained.encoder.named_parameters())
    dict_params_net = dict(net.encoder.named_parameters())
    for name, param in dict_params_ner.items():
        if name in dict_params_net:
            dict_params_net[name].data.copy_(param.data)

    # summary(net)
    for name, param in net.named_parameters():
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))

    if cuda:
        net.cuda()

    criteria = torch.nn.CrossEntropyLoss().cuda()

    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in enumerate(train_loader):
            x, masked_entities_encoded_seqs, chemical_code_seqs, disease_code_seqs, label = batch
            # print('label = ', label)
            # label = torch.squeeze(label, 1)
            # print('x: ', x)
            attention_mask = (x != pad_id).float()
            # attention_mask = (1. - attention_mask) * -10000.
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()

            prediction = model(x, token_type_ids=token_type_ids,
                               attention_mask=attention_mask,
                               used_entity_token=False, masked_entities_list=masked_entities_encoded_seqs,
                               chemical_code_list=chemical_code_seqs, disease_code_list=disease_code_seqs)
            loss = loss_fn(prediction.view(-1, 2), label.view(-1))
            # if (i % 100 == 0):
            #     print('label: ', label)
            #     print('pred: ', prediction)
            #     print('loss: ', loss)

            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))

            epoch_loss.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # scheduler.step()

        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()

        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            res = evaluate_sentence(model, test_loader, tokenizer)
            return res

    # optimizer = torch.optim.Adam([{"params": net.parameters(), "lr": 0.01}])
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)

    # optimizer = optim.SGD(net.parameters(), lr=0.05)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2,4,6,8,12,15,18,20,22,24,26,30], gamma=0.8)
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = False
        if epoch % 1 == 0 or epoch == num_epochs - 1:
            do_eval = True
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None, tokenizer=tokenizer,
                               do_eval=do_eval)
        if best_test_results is None or res_test['f1-score'] > best_test_results['f1-score']:
            best_test_results = res_test
            best_epoch = epoch
        print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(best_test_results['precision'],
                                                                                   best_test_results['recall'],
                                                                                   best_test_results['f1-score']))
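When the two encoders share the same architecture, the per-parameter copy in train_sentence can be replaced by a single state_dict load; a sketch, assuming both models expose an .encoder submodule as above:

# Equivalent one-step transfer of the NER-pretrained encoder weights,
# assuming identical parameter names and shapes in both encoders.
net.encoder.load_state_dict(net_ner_pretrained.encoder.state_dict())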