def make_joinlabel_dataset(path, use_entity_token=False, batch_size=16, shuffle=True, num_workers=0):
    """Build a document-level (full-text) joint-label CDR dataset and its DataLoader."""
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.make_example_fulltext(use_entity_token=use_entity_token)
        data.append(final_sample)
    PS = PadSequenceCDRFulltextJoinLabelDataset(token_pad_value=tokenizer.pad_token_id)
    dataset = CDRFulltextJoinLabelDataset(data)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                             num_workers=num_workers,  # was hard-coded to 0, ignoring the argument
                             collate_fn=PS, pin_memory=False)
    return data, data_loader
def make_pretrain_ner_dataset(path, use_entity_token=False, batch_size=16, shuffle=True, num_workers=0):
    """Build a token-level NER pre-training dataset from a PubTator file."""
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.extract_ner_sample(use_entity_token=use_entity_token)
        data += final_sample  # extract_ner_sample returns a list of examples
    PS = PadSequenceNERCDRDataset(token_pad_value=tokenizer.pad_token_id)
    dataset = CDRNERDataset(data)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                             num_workers=num_workers,  # was hard-coded to 0
                             collate_fn=PS, pin_memory=False)
    return data, data_loader
def make_cdr_non_global_dataset(path, use_entity_token=False, batch_size=16, shuffle=True,
                                num_workers=0, extract_type='intra'):
    """Build a sentence-level ('intra' or 'inter') CDR dataset and its DataLoader."""
    data = []
    tokenizer = get_tokenizer()
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        final_sample = sample.make_example_non_global(use_entity_token=use_entity_token,
                                                      extract_type=extract_type)
        data += final_sample
    PS = PadSequenceCDRSentenceDataset(token_pad_value=tokenizer.pad_token_id)
    dataset = CDRIntraDataset(data)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                             num_workers=num_workers,  # was hard-coded to 0
                             collate_fn=PS, pin_memory=False)
    return data, data_loader
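# A minimal usage sketch for the three dataset builders above (not part of the
# original module). The CDR path mirrors the ones used by the training
# functions below and is an assumption about the repo's data layout.
def _demo_dataset_builders():
    _, fulltext_loader = make_joinlabel_dataset('data/cdr/CDR_TrainingSet.PubTator.txt', batch_size=8)
    _, ner_loader = make_pretrain_ner_dataset('data/cdr/CDR_TrainingSet.PubTator.txt', batch_size=4)
    _, intra_loader = make_cdr_non_global_dataset('data/cdr/CDR_TrainingSet.PubTator.txt',
                                                  extract_type='intra', batch_size=8)
    print('fulltext batches: {}, ner batches: {}, intra batches: {}'.format(
        len(fulltext_loader), len(ner_loader), len(intra_loader)))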
def test_extract_data(path):
    """Split a PubTator file into intra-sentence, inter-sentence, and global examples."""
    tokenizer = get_tokenizer()
    list_data_intra, list_data_inter, list_data_global = [], [], []
    with open(path, 'r') as f:
        raw_data = f.readlines()
    data_raw_sample = gen_samples(raw_data)
    for text_block in data_raw_sample:
        sample = CDR_Sample(text_list=text_block, tokenize=tokenizer)
        data_intra, data_inter, data_global = sample.extract_intra_inter_sentence(extract_inter=True)
        list_data_intra += data_intra
        list_data_inter += data_inter
        list_data_global += data_global
    return list_data_intra, list_data_inter, list_data_global
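# A quick sanity check built on test_extract_data: count how many candidate
# pairs land in each bucket. The dev-set path mirrors the one used in train()
# below and is an assumption about the data layout.
def _demo_extract_counts(path='data/cdr/CDR_DevelopmentSet.PubTator.txt'):
    intra, inter, global_samples = test_extract_data(path)
    print('intra: {}, inter: {}, global: {}'.format(len(intra), len(inter), len(global_samples)))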
def read_tacred_data(data_file, label_dict_file):
    """Load TACRED samples and a (possibly cached) label-to-index dictionary."""
    with open(data_file, 'r') as f:
        samples = json.load(f)
    print("len sample: ", len(samples))
    from utils.trainer_utils import get_tokenizer
    tokenizer = get_tokenizer()
    data = [process_sample(sample, tokenizer) for sample in samples]
    if os.path.isfile(label_dict_file):
        with open(label_dict_file, 'r') as f:
            label_dict = json.load(f)
    else:
        # Sorted so that label ids are reproducible across runs before caching.
        all_labels = sorted(set(sample['label'] for sample in data))
        print("all_labels: ", all_labels)
        label_dict = {label: i for i, label in enumerate(all_labels)}
        with open(label_dict_file, 'w') as f:
            json.dump(label_dict, f)
    return data, label_dict
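# Usage sketch for read_tacred_data. Both file paths are placeholders
# (assumptions about where TACRED lives in this setup); the label dictionary
# is built and cached on the first run, then reloaded on subsequent runs.
def _demo_read_tacred(data_file='data/tacred/train.json',
                      label_dict_file='data/tacred/label_dict.json'):
    data, label_dict = read_tacred_data(data_file, label_dict_file)
    print('samples: {}, labels: {}'.format(len(data), len(label_dict)))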
def train(num_epochs=100):
    best_test_results = None
    train_loader = make_cdr_train_dataset(train_path='data/cdr/CDR_TrainingSet.PubTator.txt',
                                          dev_path='data/cdr/CDR_DevelopmentSet.PubTator.txt')
    test_loader = make_cdr_dataset('data/cdr/CDR_TestSet.PubTator.txt')
    tokenizer = get_tokenizer()
    # net = ElectraModelClassification(ElectraConfig())  # alternative: train from scratch
    net = ElectraModelClassification.from_pretrained('google/electra-small-discriminator')
    for name, param in net.named_parameters():
        # To freeze the encoder, set param.requires_grad = False for 'encoder' params here.
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))
    if cuda:
        net.cuda()
    criteria = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in tqdm(enumerate(train_loader)):
            x, masked_entities_encoded_seqs, chemical_code_seqs, disease_code_seqs, label = batch
            # Additive mask: 0 for real tokens, -10000 for padding. Computed here
            # but currently not passed to the model (see the commented argument below).
            attention_mask = (x != pad_id).float()
            attention_mask = (1. - attention_mask) * -10000.
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
            prediction = model(x, token_type_ids=token_type_ids,
                               # attention_masks=attention_mask,
                               used_entity_token=False,
                               masked_entities_list=masked_entities_encoded_seqs,
                               chemical_code_list=chemical_code_seqs,
                               disease_code_list=disease_code_seqs)
            loss = loss_fn(prediction, label)
            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))
            epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # scheduler.step()
        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()
        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            # evaluate is expected to return a dict with 'precision', 'recall',
            # and 'f1-score' keys; the original computed it but never returned it.
            return evaluate(model, test_loader, tokenizer)
        return None

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,  # note: both groups currently use 0.0
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)
    # optimizer = optim.SGD(net.parameters(), lr=0.05)
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = epoch % 1 == 0 or epoch == num_epochs - 1  # evaluates every epoch as written
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None,
                               tokenizer=tokenizer, do_eval=do_eval)
        if res_test is not None and (best_test_results is None
                                     or res_test['f1-score'] > best_test_results['f1-score']):
            best_test_results = res_test
    print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(
        best_test_results['precision'], best_test_results['recall'], best_test_results['f1-score']))
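# The training loops in this module build an *additive* attention mask: 0.0 at
# real token positions and -10000.0 at padding, so adding it to attention
# logits drives padded positions toward zero weight after softmax. A minimal,
# standalone sketch of that convention (plain torch, no repo code assumed):
def _example_additive_attention_mask(x, pad_id):
    # x: LongTensor of token ids, shape (batch, seq_len)
    mask = (x != pad_id).float()        # 1.0 for real tokens, 0.0 for padding
    return (1.0 - mask) * -10000.0      # 0.0 for real tokens, -10000.0 for padding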
def train_ner(num_epochs=100, use_entity_token=False):
    best_test_results = None
    best_epoch = None
    _, train_loader = make_train_pretrain_ner_dataset(train_path='data/cdr/CDR_TrainingSet.PubTator.txt',
                                                      dev_path='data/cdr/CDR_DevelopmentSet.PubTator.txt',
                                                      use_entity_token=use_entity_token, batch_size=4)
    _, test_loader = make_pretrain_ner_dataset('data/cdr/CDR_TestSet.PubTator.txt',
                                               use_entity_token=use_entity_token, batch_size=4)
    tokenizer = get_tokenizer()
    net = ElectraModelEntityTokenClassification.from_pretrained('google/electra-base-discriminator')
    net.resize_token_embeddings(len(tokenizer))
    for name, param in net.named_parameters():
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))
    if cuda:
        net.cuda()
    criteria = torch.nn.CrossEntropyLoss().cuda()
    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in enumerate(train_loader):
            x, entity_token_ids, label = batch
            # Additive mask (0 for tokens, -10000 for padding); computed but not
            # passed to the model in the call below.
            attention_mask = (x != pad_id).float()
            attention_mask = (1. - attention_mask) * -10000.
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
            prediction = model(x, token_type_ids=token_type_ids,
                               # attention_masks=attention_mask,
                               entity_token_ids=entity_token_ids)
            loss = loss_fn(prediction.view(-1, 2), label.view(-1))
            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))
            epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()
        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            return evaluate_ner(model, test_loader, tokenizer)
        return None

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = epoch % 1 == 0 or epoch == num_epochs - 1
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None,
                               tokenizer=tokenizer, do_eval=do_eval)
        if res_test is not None and (best_test_results is None
                                     or res_test['f1-score'] > best_test_results['f1-score']):
            best_test_results = res_test
            best_epoch = epoch
            net.save_pretrained('models_saved/electra_token_model')
    print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(
        best_test_results['precision'], best_test_results['recall'], best_test_results['f1-score']))
    print('Best epoch = ', best_epoch)
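# The optimizer setup in the train_* functions follows the usual Hugging Face
# pattern of excluding biases and LayerNorm weights from weight decay, though
# as written both groups use weight_decay=0.0, so the split currently has no
# effect. A sketch of the same grouping with a non-zero decay on the first
# group (0.01 is an assumed value, not taken from this repo):
def _example_grouped_parameters(model, weight_decay=0.01):
    no_decay = ["bias", "LayerNorm.weight"]
    return [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]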
def train_sentence(num_epochs=100, use_entity_token=False):
    best_test_results = None
    best_epoch = None
    _, train_loader = make_cdr_non_global_dataset('/home/levi/levi/relation_extraction_cdr/data/gda/train.txt',
                                                  use_entity_token=use_entity_token,
                                                  extract_type='intra', batch_size=8)
    _, test_loader = make_cdr_non_global_dataset('/home/levi/levi/relation_extraction_cdr/data/gda/test.txt',
                                                 use_entity_token=use_entity_token,
                                                 extract_type='intra', batch_size=8)
    tokenizer = get_tokenizer()
    # Warm-start the sentence classifier's encoder from the NER-pretrained checkpoint.
    net_ner_pretrained = ElectraModelEntityTokenClassification.from_pretrained('models_saved/electra_token_model')
    net = ElectraModelEntitySentenceClassification.from_pretrained('google/electra-base-discriminator')
    net.resize_token_embeddings(len(tokenizer))
    # The original copy loop iterated an already-exhausted named_parameters()
    # generator and so never copied anything; loading the state dict directly
    # does what was intended.
    net.encoder.load_state_dict(net_ner_pretrained.encoder.state_dict())
    for name, param in net.named_parameters():
        print("name: {}, unfrozen: {}, size: {}".format(name, param.requires_grad, param.size()))
    if cuda:
        net.cuda()
    criteria = torch.nn.CrossEntropyLoss().cuda()
    pad_id = tokenizer.pad_token_id

    def train_model(model, loss_fn=None, optimizer=None, scheduler=None, tokenizer=None, do_eval=False):
        model.train()
        epoch_loss = []
        all_labels = []
        all_preds = []
        for i, batch in enumerate(train_loader):
            x, masked_entities_encoded_seqs, chemical_code_seqs, disease_code_seqs, label = batch
            # Here the mask is the plain 0/1 padding mask (no -10000 scaling) and,
            # unlike train()/train_ner(), it is passed to the model.
            attention_mask = (x != pad_id).float()
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                label = label.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
            prediction = model(x, token_type_ids=token_type_ids,
                               attention_mask=attention_mask,
                               used_entity_token=False,
                               masked_entities_list=masked_entities_encoded_seqs,
                               chemical_code_list=chemical_code_seqs,
                               disease_code_list=disease_code_seqs)
            loss = loss_fn(prediction.view(-1, 2), label.view(-1))
            pred = prediction.argmax(dim=-1)
            all_labels.append(label.data.to('cpu'))
            all_preds.append(pred.to('cpu'))
            epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # scheduler.step()
        average_loss = np.mean(epoch_loss)
        new_all_labels = []
        new_all_preds = []
        for i in range(len(all_labels)):
            new_all_labels += all_labels[i].tolist()
            new_all_preds += all_preds[i].tolist()
        from sklearn.metrics import classification_report
        print("average RE loss : ", average_loss)
        print("train_cls report: \n", classification_report(new_all_labels, new_all_preds))
        print("Confusion matrix report: \n", confusion_matrix(new_all_labels, new_all_preds))
        if do_eval:
            return evaluate_sentence(model, test_loader, tokenizer)
        return None

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in net.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in net.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, eps=1e-8)
    for epoch in range(num_epochs):
        print('Epoch:', epoch)
        do_eval = epoch % 1 == 0 or epoch == num_epochs - 1
        res_test = train_model(net, loss_fn=criteria, optimizer=optimizer, scheduler=None,
                               tokenizer=tokenizer, do_eval=do_eval)
        if res_test is not None and (best_test_results is None
                                     or res_test['f1-score'] > best_test_results['f1-score']):
            best_test_results = res_test
            best_epoch = epoch
    print('Best result on test data: Precision: {}, Recall: {}, F1: {}'.format(
        best_test_results['precision'], best_test_results['recall'], best_test_results['f1-score']))
    print('Best epoch = ', best_epoch)
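# The train_* loops compare results via res_test['f1-score'], so the evaluate*
# helpers are assumed to return a dict with 'precision', 'recall', and
# 'f1-score' keys. A minimal sketch of a helper producing such a dict from
# collected labels and predictions (binary case, positive class = 1; this is
# an assumption about the metric convention, not code from this repo):
def _example_metric_dict(labels, preds):
    from sklearn.metrics import precision_recall_fscore_support
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', pos_label=1)
    return {'precision': precision, 'recall': recall, 'f1-score': f1}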