def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--max_choices_num", default=10, type=int, help="The maximum number of cadicate answer, shorter than this will be padded.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build classification model and load parameters. model = MultipleChoice(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset = read_dataset(args, args.test_path, None) model.eval() batch_size = args.batch_size results_final = [] dataset_by_group = {} print("The number of prediction instances: ", len(dataset)) for example in dataset: if example[-1] not in dataset_by_group: dataset_by_group[example[-1]] = [example] else: dataset_by_group[example[-1]].append(example) for group_index, examples in dataset_by_group.items(): src = torch.LongTensor([example[0] for example in examples]) tgt = torch.LongTensor([example[1] for example in examples]) seg = torch.LongTensor([example[2] for example in examples]) index = 0 results = [] for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) pred = torch.argmax(logits, dim=1) pred = pred.cpu().numpy().tolist() for j in range(len(pred)): results.append((examples[index][-2], logits[index].cpu().numpy())) index += 1 results_final.extend(postprocess_chid_predictions(results)) with open(args.prediction_path, 'w') as f: json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument("--train_answer_path", type=str, required=True, help="Path of the answers for trainset.") parser.add_argument("--dev_answer_path", type=str, required=True, help="Path of the answers for devset.") parser.add_argument( "--max_choices_num", default=10, type=int, help= "The maximum number of cadicate answer, shorter than this will be padded." ) args = parser.parse_args() args.labels_num = args.max_choices_num if args.output_model_path == None: args.output_model_path = "./models/chid_model.bin" # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build multiple choice model. model = MultipleChoice(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. trainset = read_dataset(args, args.train_path, args.train_answer_path) random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size src = torch.LongTensor([example[0] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg = torch.LongTensor([example[2] for example in trainset]) args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, result, best_result = 0., 0., 0. print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. result = evaluate( args, read_dataset(args, args.dev_path, args.dev_answer_path)) if result[0] > best_result: best_result = result[0] save_model(model, args.output_model_path)
parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--pooling", choices=["first", "last", "max", "mean"], \ default="first", help="Pooling Type.") parser.add_argument("--whitening_size", type=int, default=None, help="Output vector size after whitening.") tokenizer_opts(parser) args = parser.parse_args() args = load_hyperparam(args) args.tokenizer = str2tokenizer[args.tokenizer](args) # Build feature extractor model. model = FeatureExtractor(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model.eval()
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build machine reading comprehension model. model = MachineReadingComprehension(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Training phase. batch_size = args.batch_size print("Batch size: ", batch_size) trainset, _ = read_dataset(args, args.train_path) random.shuffle(trainset) instances_num = len(trainset) src = torch.LongTensor([sample[0] for sample in trainset]) seg = torch.LongTensor([sample[1] for sample in trainset]) start_position = torch.LongTensor([sample[2] for sample in trainset]) end_position = torch.LongTensor([sample[3] for sample in trainset]) args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss = 0.0 result = 0.0 best_result = 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate( batch_loader(batch_size, src, seg, start_position, end_position)): loss = train(args, model, optimizer, scheduler, src_batch, seg_batch, start_position_batch, end_position_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 result = evaluate(args, *read_dataset(args, args.dev_path)) if result > best_result: best_result = result save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, *read_dataset(args, args.test_path))
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.") parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Tokenizer options. parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") # Evaluation options. parser.add_argument("--mean_reciprocal_rank", action="store_true", help="Evaluation metrics for DBQA dataset.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. labels_set = set() columns = {} with open(args.train_path, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): try: line = line.strip().split("\t") if line_id == 0: for i, column_name in enumerate(line): columns[column_name] = i continue label = int(line[columns["label"]]) labels_set.add(label) except: pass args.labels_num = len(labels_set) # Load vocabulary. vocab = Vocab() vocab.load(args.vocab_path) args.vocab = vocab # Build bert model. # A pseudo target is added. args.target = "bert" model = build_model(args) # Load or initialize parameters. if args.pretrained_model_path is not None: # Initialize with pretrained model. model.load_state_dict(torch.load(args.pretrained_model_path), strict=False) else: # Initialize with normal distribution. for n, p in list(model.named_parameters()): if 'gamma' not in n and 'beta' not in n: p.data.normal_(0, 0.02) # Build classification model. model = BertClassifier(args, model) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) # Datset loader. def batch_loader(batch_size, input_ids, label_ids, mask_ids): instances_num = input_ids.size()[0] for i in range(instances_num // batch_size): input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :] label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size] mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :] yield input_ids_batch, label_ids_batch, mask_ids_batch if instances_num > instances_num // batch_size * batch_size: input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :] label_ids_batch = label_ids[instances_num // batch_size * batch_size:] mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :] yield input_ids_batch, label_ids_batch, mask_ids_batch # Build tokenizer. tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Read dataset. def read_dataset(path): dataset = [] with open(path, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): if line_id == 0: continue try: line = line.strip().split('\t') if len(line) == 2: label = int(line[columns["label"]]) text = line[columns["text_a"]] tokens = [ vocab.get(t) for t in tokenizer.tokenize(text) ] tokens = [CLS_ID] + tokens mask = [1] * len(tokens) if len(tokens) > args.seq_length: tokens = tokens[:args.seq_length] mask = mask[:args.seq_length] while len(tokens) < args.seq_length: tokens.append(0) mask.append(0) dataset.append((tokens, label, mask)) elif len(line) == 3: # For sentence pair input. label = int(line[columns["label"]]) text_a, text_b = line[columns["text_a"]], line[ columns["text_b"]] tokens_a = [ vocab.get(t) for t in tokenizer.tokenize(text_a) ] tokens_a = [CLS_ID] + tokens_a + [SEP_ID] tokens_b = [ vocab.get(t) for t in tokenizer.tokenize(text_b) ] tokens_b = tokens_b + [SEP_ID] tokens = tokens_a + tokens_b mask = [1] * len(tokens_a) + [2] * len(tokens_b) if len(tokens) > args.seq_length: tokens = tokens[:args.seq_length] mask = mask[:args.seq_length] while len(tokens) < args.seq_length: tokens.append(0) mask.append(0) dataset.append((tokens, label, mask)) elif len(line) == 4: # For dbqa input. qid = int(line[columns["qid"]]) label = int(line[columns["label"]]) text_a, text_b = line[columns["text_a"]], line[ columns["text_b"]] tokens_a = [ vocab.get(t) for t in tokenizer.tokenize(text_a) ] tokens_a = [CLS_ID] + tokens_a + [SEP_ID] tokens_b = [ vocab.get(t) for t in tokenizer.tokenize(text_b) ] tokens_b = tokens_b + [SEP_ID] tokens = tokens_a + tokens_b mask = [1] * len(tokens_a) + [2] * len(tokens_b) if len(tokens) > args.seq_length: tokens = tokens[:args.seq_length] mask = mask[:args.seq_length] while len(tokens) < args.seq_length: tokens.append(0) mask.append(0) dataset.append((tokens, label, mask, qid)) else: pass except: pass return dataset # Evaluation function. def evaluate(args, is_test): if is_test: dataset = read_dataset(args.test_path) else: dataset = read_dataset(args.dev_path) input_ids = torch.LongTensor([sample[0] for sample in dataset]) label_ids = torch.LongTensor([sample[1] for sample in dataset]) mask_ids = torch.LongTensor([sample[2] for sample in dataset]) batch_size = args.batch_size instances_num = input_ids.size()[0] if is_test: print("The number of evaluation instances: ", instances_num) correct = 0 # Confusion matrix. confusion = torch.zeros(args.labels_num, args.labels_num, dtype=torch.long) model.eval() if not args.mean_reciprocal_rank: for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) with torch.no_grad(): loss, logits = model(input_ids_batch, label_ids_batch, mask_ids_batch) logits = nn.Softmax(dim=1)(logits) pred = torch.argmax(logits, dim=1) gold = label_ids_batch for j in range(pred.size()[0]): confusion[pred[j], gold[j]] += 1 correct += torch.sum(pred == gold).item() if is_test: print("Confusion matrix:") print(confusion) print("Report precision, recall, and f1:") for i in range(confusion.size()[0]): p = confusion[i, i].item() / confusion[i, :].sum().item() r = confusion[i, i].item() / confusion[:, i].sum().item() f1 = 2 * p * r / (p + r) if is_test: print("Label {}: {:.3f}, {:.3f}, {:.3f}".format( i, p, r, f1)) print("Acc. (Correct/Total): {:.4f} ({}/{}) ".format( correct / len(dataset), correct, len(dataset))) return correct / len(dataset) else: for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) with torch.no_grad(): loss, logits = model(input_ids_batch, label_ids_batch, mask_ids_batch) logits = nn.Softmax(dim=1)(logits) if i == 0: logits_all = logits if i >= 1: logits_all = torch.cat((logits_all, logits), 0) order = -1 gold = [] for i in range(len(dataset)): qid = dataset[i][3] label = dataset[i][1] if qid == order: j += 1 if label == 1: gold.append((qid, j)) else: order = qid j = 0 if label == 1: gold.append((qid, j)) label_order = [] order = -1 for i in range(len(gold)): if gold[i][0] == order: templist.append(gold[i][1]) elif gold[i][0] != order: order = gold[i][0] if i > 0: label_order.append(templist) templist = [] templist.append(gold[i][1]) label_order.append(templist) order = -1 score_list = [] for i in range(len(logits_all)): score = float(logits_all[i][1]) qid = int(dataset[i][3]) if qid == order: templist.append(score) else: order = qid if i > 0: score_list.append(templist) templist = [] templist.append(score) score_list.append(templist) rank = [] pred = [] for i in range(len(score_list)): if len(label_order[i]) == 1: if label_order[i][0] < len(score_list[i]): true_score = score_list[i][label_order[i][0]] score_list[i].sort(reverse=True) for j in range(len(score_list[i])): if score_list[i][j] == true_score: rank.append(1 / (j + 1)) else: rank.append(0) else: true_rank = len(score_list[i]) for k in range(len(label_order[i])): if label_order[i][k] < len(score_list[i]): true_score = score_list[i][label_order[i][k]] temp = sorted(score_list[i], reverse=True) for j in range(len(temp)): if temp[j] == true_score: if j < true_rank: true_rank = j if true_rank < len(score_list[i]): rank.append(1 / (true_rank + 1)) else: rank.append(0) MRR = sum(rank) / len(rank) print(MRR) return MRR # Training phase. print("Start training.") trainset = read_dataset(args.train_path) random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size input_ids = torch.LongTensor([example[0] for example in trainset]) label_ids = torch.LongTensor([example[1] for example in trainset]) mask_ids = torch.LongTensor([example[2] for example in trainset]) train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup, t_total=train_steps) total_loss = 0. result = 0.0 best_result = 0.0 for epoch in range(1, args.epochs_num + 1): model.train() for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): model.zero_grad() input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) loss, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch) if torch.cuda.device_count() > 1: loss = torch.mean(loss) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. loss.backward() optimizer.step() result = evaluate(args, False) if result > best_result: best_result = result save_model(model, args.output_model_path) else: continue # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") model = load_model(model, args.output_model_path) evaluate(args, True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--load_model_path", default=None, type=str, help="Path of the multiple choice model.") parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=32, help="Batch size.") parser.add_argument("--seq_length", type=int, default=64, help="Sequence length.") parser.add_argument( "--max_choices_num", default=10, type=int, help= "The maximum number of cadicate answer, shorter than this will be padded." ) parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Tokenizer options. parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="char", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Build classification model and load parameters. model = MultipleChoice(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset = read_dataset(args, args.test_path, None) model.eval() batch_size = args.batch_size results_final = [] dataset_by_group = {} print("The number of prediction instances: ", len(dataset)) for example in dataset: if example[-1] not in dataset_by_group: dataset_by_group[example[-1]] = [example] else: dataset_by_group[example[-1]].append(example) for group_index, examples in dataset_by_group.items(): src = torch.LongTensor([example[0] for example in examples]) tgt = torch.LongTensor([example[1] for example in examples]) seg = torch.LongTensor([example[2] for example in examples]) index = 0 results = [] for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) pred = torch.argmax(logits, dim=1) pred = pred.cpu().numpy().tolist() for j in range(len(pred)): results.append( (examples[index][-2], logits[index].cpu().numpy())) index += 1 results_final.extend(postprocess_chid_predictions(results)) with open(args.prediction_path, 'w') as f: json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument("--label2id_path", type=str, required=True, help="Path of the label2id file.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) set_seed(args.seed) args.begin_ids = [] with open(args.label2id_path, mode="r", encoding="utf-8") as f: l2i = json.load(f) print("Labels: ", l2i) l2i["[PAD]"] = len(l2i) for label in l2i: if label.startswith("B"): args.begin_ids.append(l2i[label]) args.l2i = l2i args.labels_num = len(l2i) args.tokenizer = SpaceTokenizer(args) # Build sequence labeling model. model = NerTagger(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. instances = read_dataset(args, args.train_path) src = torch.LongTensor([ins[0] for ins in instances]) tgt = torch.LongTensor([ins[1] for ins in instances]) seg = torch.LongTensor([ins[2] for ins in instances]) instances_num = src.size(0) batch_size = args.batch_size args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, f1, best_f1 = 0.0, 0.0, 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, tgt_batch, seg_batch) in enumerate(batch_loader(batch_size, src, tgt, seg)): loss = train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 f1 = evaluate(args, read_dataset(args, args.dev_path)) if f1 > best_f1: best_f1 = f1 save_model(model, args.output_model_path) else: continue # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: args.model.module.load_state_dict( torch.load(args.output_model_path)) else: args.model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, read_dataset(args, args.test_path))
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.") parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Tokenizer options. parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space." ) # Optimizer options. parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--soft_alpha", type=float, default=0.5, help="Weight of the soft targets loss.") parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") parser.add_argument("--fp16", action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.") parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Build classification model. model = Classifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. trainset = read_dataset(args, args.train_path) random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size src = torch.LongTensor([example[0] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg = torch.LongTensor([example[2] for example in trainset]) if args.soft_targets: soft_tgt = torch.FloatTensor([example[3] for example in trainset]) else: soft_tgt = None args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, result, best_result = 0., 0., 0. print("Start training.") for epoch in range(1, args.epochs_num+1): model.train() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, src, tgt, seg, soft_tgt)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i+1, total_loss / args.report_steps)) total_loss = 0. #change by qyl import os base_dir=args.output_model_path.split('/')[0] if not os.path.exists(base_dir): os.makedirs(base_dir) result = evaluate(args, read_dataset(args, args.dev_path)) save_model(model, base_dir+'/epoch_'+str(epoch)+'.bin') if result[0] > best_result: best_result = result[0] save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, read_dataset(args, args.test_path), True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--dataset_path", type=str, default="dataset.pt", help="Path of the preprocessed dataset.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--pretrained_model_path", type=str, default=None, help="Path of the pretrained model.") parser.add_argument("--output_model_path", type=str, required=True, help="Path of the output model.") parser.add_argument("--config_path", type=str, default="models/bert_base_config.json", help="Config file of model hyper-parameters.") # Training and saving options. parser.add_argument("--total_steps", type=int, default=100000, help="Total training steps.") parser.add_argument("--save_checkpoint_steps", type=int, default=10000, help="Specific steps to save model checkpoint.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--accumulation_steps", type=int, default=1, help="Specific steps to accumulate gradient.") parser.add_argument( "--batch_size", type=int, default=32, help= "Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps]." ) parser.add_argument("--instances_buffer_size", type=int, default=25600, help="The buffer size of instances in memory.") # Model options. parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") parser.add_argument("--embedding", choices=["bert", "word", "gpt"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "gpt2", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--target", choices=["bert", "lm", "cls", "mlm", "bilm", "albert"], default="bert", help="The training target of the pretraining model.") parser.add_argument("--tie_weights", action="store_true", help="Tie the word embedding and softmax weights.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--has_lmtarget_bias", action="store_true", help="Add bias on output_layer for lm target.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Masking options. parser.add_argument("--span_masking", action="store_true", help="Span masking.") parser.add_argument( "--span_geo_prob", type=float, default=0.2, help="Hyperparameter of geometric distribution for span masking.") parser.add_argument("--span_max_length", type=int, default=10, help="Max length for span masking.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Initial learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") parser.add_argument("--beta1", type=float, default=0.9, help="Beta1 for Adam optimizer.") parser.add_argument("--beta2", type=float, default=0.999, help="Beta2 for Adam optimizer.") parser.add_argument( "--fp16", action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( "--fp16_opt_level", choices=["O0", "O1", "O2", "O3"], default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") # GPU options. parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.") parser.add_argument( "--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process." " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU." ) parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.") parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.") args = parser.parse_args() # Load hyper-parameters from config file. if args.config_path: load_hyperparam(args) ranks_num = len(args.gpu_ranks) if args.world_size > 1: # Multiprocessing distributed mode. assert torch.cuda.is_available(), "No available GPUs." assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit." assert ranks_num <= torch.cuda.device_count( ), "Started processes exceeds the available GPUs." args.dist_train = True args.ranks_num = ranks_num print("Using distributed mode for training.") elif args.world_size == 1 and ranks_num == 1: # Single GPU mode. assert torch.cuda.is_available(), "No available GPUs." args.gpu_id = args.gpu_ranks[0] assert args.gpu_id < torch.cuda.device_count( ), "Invalid specified GPU device." args.dist_train = False args.single_gpu = True print("Using GPU %d for training." % args.gpu_id) else: # CPU mode. assert ranks_num == 0, "GPUs are specified, please check the arguments." args.dist_train = False args.single_gpu = False print("Using CPU mode for training.") trainer.train_and_validate(args)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/tagger_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, required=True, help="Path of the testset.") parser.add_argument("--config_path", default="./models/google_config.json", type=str, help="Path of the config file.") # Model options parser.add_argument("--batch_size", type=int, default=32, help="Batch_size.") parser.add_argument("--seq_length", default=128, type=int, help="Sequence length.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Training options. parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) set_seed(args.seed) # Find tagging labels. labels_map = {"NULL": 0, "O": 1} # ID for padding and non-entity. with open(args.train_path, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): if line_id == 0: continue line = line.strip().split() if len(line) != 2: continue if line[1] not in labels_map: labels_map[line[1]] = len(labels_map) print("Labels: ", labels_map) print("Label Num: ", len(labels_map)) args.labels_num = len(labels_map) # Create the bad pairs args.bad_pairs = [] args.good_pairs = [] for key1, value1 in labels_map.items(): key1 = key1.strip().split('-') if len(key1) < 1 or len(key1) > 2: print("Error label: ", key1) exit() for key2, value2 in labels_map.items(): key2 = key2.strip().split('-') if len(key2) == 1: continue if len(key1) == 1 and len(key2) == 2: if key2[0] == 'I': args.bad_pairs.append([value1, value2]) continue # p(B-X -> I-Y) = 0 if key1[1] != key2[1] and key1[0] == 'B' and key2[0] == 'I': args.bad_pairs.append([value1, value2]) # p(I-X -> I-Y) = 0 if key1[1] != key2[1] and key1[0] == 'I' and key2[0] == 'I': args.bad_pairs.append([value1, value2]) # p(B-X -> I-X) = 10 if key1[1] == key2[1] and key1[0] == 'B' and key2[0] == 'I': args.good_pairs.append([value1, value2]) print("Bad pairs: ", args.bad_pairs) print("Good pairs: ", args.good_pairs) # Load vocabulary. vocab = Vocab() vocab.load(args.vocab_path) args.vocab = vocab # Build bert model. # A pseudo target is added. args.target = "bert" model = build_model(args) # Load or initialize parameters. if args.pretrained_model_path is not None: # Initialize with pretrained model. model.load_state_dict(torch.load(args.pretrained_model_path), strict=False) else: # Initialize with normal distribution. for n, p in list(model.named_parameters()): if 'gamma' not in n and 'beta' not in n: p.data.normal_(0, 0.02) # Some other parameters args.lstm_hidden = args.hidden_size args.lstm_layers = 2 args.lstm_dropout = 0.1 if torch.cuda.is_available(): args.use_cuda = True # Build sequence labeling model. model = CCKSTagger(args, model) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.module model = model.to(device) # Datset loader. def batch_loader(batch_size, input_ids, label_ids, mask_ids): instances_num = input_ids.size()[0] for i in range(instances_num // batch_size): input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :] label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :] mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :] yield input_ids_batch, label_ids_batch, mask_ids_batch if instances_num > instances_num // batch_size * batch_size: input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :] label_ids_batch = label_ids[instances_num // batch_size * batch_size:, :] mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :] yield input_ids_batch, label_ids_batch, mask_ids_batch # Read dataset. def read_dataset(path): dataset = [] with open(path, mode="r", encoding="utf-8") as f: tokens, labels = [], [] for line_id, line in enumerate(f): if line_id == 0: continue line = line.strip().split() if len(line) != 2: if len(labels) == 0: continue assert len(tokens) == len(labels) tokens = [vocab.get(t) for t in tokens] labels = [labels_map[l] for l in labels] mask = [1] * len(tokens) if len(tokens) > args.seq_length: tokens = tokens[:args.seq_length] labels = labels[:args.seq_length] mask = mask[:args.seq_length] while len(tokens) < args.seq_length: tokens.append(0) labels.append(0) mask.append(0) dataset.append([tokens, labels, mask]) tokens, labels = [], [] continue tokens.append(line[0]) labels.append(line[1]) return dataset # Evaluation function. def evaluate(args, is_test): if is_test: dataset = read_dataset(args.test_path) else: dataset = read_dataset(args.dev_path) input_ids = torch.LongTensor([sample[0] for sample in dataset]) label_ids = torch.LongTensor([sample[1] for sample in dataset]) mask_ids = torch.LongTensor([sample[2] for sample in dataset]) instances_num = input_ids.size(0) batch_size = args.batch_size if is_test: print("Batch size: ", batch_size) print("The number of test instances:", instances_num) correct = 0 gold_entities_num = 0 pred_entities_num = 0 confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long) model.eval() for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) # loss, _, pred, gold = model(input_ids_batch, label_ids_batch, mask_ids_batch) feats = model(input_ids_batch, label_ids_batch, mask_ids_batch) path_score, best_path = model.crf(feats, mask_ids_batch.byte()) pred = best_path.contiguous().view(-1) gold = label_ids_batch.contiguous().view(-1) """ if i == 0: print('pred', pred) print('gold', gold) """ # Gold. for j in range(gold.size()[0]): if (j > 0 and gold[j - 1].item() <= 1 and gold[j].item() > 1) or (j == 0 and gold[j].item() > 1): gold_entities_num += 1 # Predict. for j in range(pred.size()[0]): if (j > 0 and pred[j - 1].item() <= 1 and pred[j].item() > 1 and gold[j].item() != 0) or (j == 0 and pred[j].item() > 1): pred_entities_num += 1 pred_entities_pos = [] gold_entities_pos = [] start, end = 0, 0 # Correct. for j in range(gold.size()[0]): if (j > 0 and gold[j - 1].item() <= 1 and gold[j].item() > 1) or (j == 0 and gold[j].item() > 1): start = j for k in range(j, gold.size()[0]): if gold[k].item() <= 1: end = k - 1 break else: end = gold.size()[0] - 1 gold_entities_pos.append((start, end)) # Predict. for j in range(pred.size()[0]): if (j > 0 and pred[j - 1].item() <= 1 and pred[j].item() > 1) or (j == 0 and pred[j].item() > 1): start = j for k in range(j, pred.size()[0]): if pred[k].item() <= 1: end = k - 1 break else: end = pred.size()[0] - 1 pred_entities_pos.append((start, end)) for entity in pred_entities_pos: if entity not in gold_entities_pos: continue for j in range(entity[0], entity[1] + 1): if gold[j].item() != pred[j].item(): break else: correct += 1 print("Report precision, recall, and f1:") p = correct / pred_entities_num r = correct / gold_entities_num f1 = 2 * p * r / (p + r) print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1)) return f1 # Training phase. print("Start training.") instances = read_dataset(args.train_path) input_ids = torch.LongTensor([ins[0] for ins in instances]) label_ids = torch.LongTensor([ins[1] for ins in instances]) mask_ids = torch.LongTensor([ins[2] for ins in instances]) instances_num = input_ids.size(0) batch_size = args.batch_size train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup, t_total=train_steps) total_loss = 0. f1 = 0.0 best_f1 = 0.0 for epoch in range(1, args.epochs_num + 1): model.train() for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): model.zero_grad() input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) """ loss, _, _, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch) if torch.cuda.device_count() > 1: loss = torch.mean(loss) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i+1, total_loss / args.report_steps)) total_loss = 0. """ """ print("mask1:", mask_ids_batch) print("label1:", label_ids_batch) """ feats = model(input_ids_batch, label_ids_batch, mask_ids_batch) """ print("feats:", feats) """ loss = model.loss(feats, mask_ids_batch, label_ids_batch) if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Loss: {:.3f}".format( epoch, i + 1, loss)) loss.backward() optimizer.step() f1 = evaluate(args, False) if f1 > best_f1: best_f1 = f1 save_model(model, args.output_model_path) #else: # break # Evaluation phase. print("Start evaluation.") """ if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) """ model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, True) """
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/tagger_model.bin", type=str, help="Path of the output model.") parser.add_argument("--output_encoder", default="./luke-models/", type=str, help="Path of the output luke model.") parser.add_argument("--suffix_file_encoder", default="encoder", type=str, help="output file suffix luke model.") parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, required=True, help="Path of the testset.") parser.add_argument("--config_path", default="./models/google_config.json", type=str, help="Path of the config file.") parser.add_argument("--output_file_prefix", type=str, required=True, help="Prefix for file output.") # Model options. parser.add_argument("--batch_size", type=int, default=2, help="Batch_size.") parser.add_argument("--seq_length", default=256, type=int, help="Sequence length.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Training options. parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=5, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=2, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") # kg parser.add_argument("--kg_name", required=True, help="KG name or path") parser.add_argument("--use_kg", action='store_true', help="Enable the use of KG.") parser.add_argument("--dry_run", action='store_true', help="Dry run to test the implementation.") parser.add_argument( "--voting_choicer", action='store_true', help="Enable the Voting choicer to select the entity type.") parser.add_argument("--eval_kg_tag", action='store_true', help="Enable to include [ENT] tag in evaluation.") parser.add_argument("--use_subword_tag", action='store_true', help="Enable to use separate tag for subword splits.") parser.add_argument("--debug", action='store_true', help="Enable debug.") parser.add_argument("--reverse_order", action='store_true', help="Reverse the feature selection order.") parser.add_argument("--max_entities", default=2, type=int, help="Number of KG features.") parser.add_argument("--eval_range_with_types", action='store_true', help="Enable to eval range with types.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) set_seed(args.seed) labels_map = {"[PAD]": 0, "[ENT]": 1, "[X]": 2, "[CLS]": 3, "[SEP]": 4} begin_ids = [] # Find tagging labels with open(args.train_path, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): if line_id == 0: continue labels = line.strip().split("\t")[0].split() for l in labels: if l not in labels_map: if l.startswith("B") or l.startswith("S"): begin_ids.append(len(labels_map)) labels_map[l] = len(labels_map) idx_to_label = {labels_map[key]: key for key in labels_map} print(begin_ids) print("Labels: ", labels_map) args.labels_num = len(labels_map) # Build knowledge graph. if args.kg_name == 'none': kg_file = [] else: kg_file = args.kg_name # Load Luke model. model_archive = ModelArchive.load(args.pretrained_model_path) tokenizer = model_archive.tokenizer # Handling space character in roberta tokenizer byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} # Load the pretrained model encoder = LukeModel(model_archive.config) encoder.load_state_dict(model_archive.state_dict, strict=False) # Build sequence labeling model. model = LukeTagger(args, encoder) kg = KnowledgeGraph(kg_file=kg_file, tokenizer=tokenizer) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) # Datset loader. def batch_loader(batch_size, input_ids, label_ids, mask_ids, pos_ids, vm_ids, tag_ids, segment_ids): instances_num = input_ids.size()[0] for i in range(instances_num // batch_size): input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :] label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :] mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :] pos_ids_batch = pos_ids[i * batch_size:(i + 1) * batch_size, :] vm_ids_batch = vm_ids[i * batch_size:(i + 1) * batch_size, :, :] tag_ids_batch = tag_ids[i * batch_size:(i + 1) * batch_size, :] segment_ids_batch = segment_ids[i * batch_size:(i + 1) * batch_size, :] yield input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch if instances_num > instances_num // batch_size * batch_size: input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :] label_ids_batch = label_ids[instances_num // batch_size * batch_size:, :] mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :] pos_ids_batch = pos_ids[instances_num // batch_size * batch_size:, :] vm_ids_batch = vm_ids[instances_num // batch_size * batch_size:, :, :] tag_ids_batch = tag_ids[instances_num // batch_size * batch_size:, :] segment_ids_batch = segment_ids[instances_num // batch_size * batch_size:, :] yield input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch # Read dataset. def read_dataset(path): dataset = [] count = 0 with open(path, mode="r", encoding="utf8") as f: f.readline() tokens, labels = [], [] for line_id, line in enumerate(f): fields = line.strip().split("\t") if len(fields) == 2: labels, tokens = fields elif len(fields) == 3: labels, tokens, cls = fields else: print( f'The data is not in accepted format at line no:{line_id}.. Ignored' ) continue tokens, pos, vm, tag = \ kg.add_knowledge_with_vm([tokens], [labels], use_kg=args.use_kg, max_length=args.seq_length, max_entities=args.max_entities, reverse_order=args.reverse_order) tokens = tokens[0] pos = pos[0] vm = vm[0].astype("bool") tag = tag[0] # tokens = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token]) non_pad_tokens = [ tok for tok in tokens if tok != tokenizer.pad_token ] num_tokens = len(non_pad_tokens) num_pad = len(tokens) - num_tokens labels = [config.CLS_TOKEN ] + labels.split(" ") + [config.SEP_TOKEN] new_labels = [] j = 0 joiner = '-' for i in range(len(tokens)): if tag[i] == 0 and tokens[i] != tokenizer.pad_token: cur_type = labels[j] new_labels.append(cur_type) if cur_type != 'O': joiner = cur_type[1] prev_label = cur_type[2:] else: prev_label = cur_type j += 1 elif tag[i] == 1 and tokens[ i] != tokenizer.pad_token: # 是添加的实体 new_labels.append('[ENT]') elif tag[i] == 2: if prev_label == 'O': new_labels.append('O') else: if args.use_subword_tag: new_labels.append('[X]') else: new_labels.append(f'I{joiner}' + prev_label) else: new_labels.append(PAD_TOKEN) new_labels = [labels_map[l] for l in new_labels] # print(tokens) # print(labels) # print(tag) mask = [1] * (num_tokens) + [0] * num_pad word_segment_ids = [0] * (len(tokens)) # print(len(tokens)) # print(len(tag)) # exit() # print(tokenizer.pad_token_id) # for i in range(len(tokens)): # if tag[i] == 0 and tokens[i] != tokenizer.pad_token: # new_labels.append(labels[j]) # j += 1 # elif tag[i] == 1 and tokens[i] != tokenizer.pad_token: # 是添加的实体 # new_labels.append(labels_map['[ENT]']) # elif tag[i] == 2: # if args.use_subword_tag: # new_labels.append(labels_map['[X]']) # else: # new_labels.append(labels_map['[ENT]']) # else: # new_labels.append(labels_map[PAD_TOKEN]) # print(labels) # print(new_labels) # print([idx_to_label.get(key) for key in labels]) # print([idx_to_label.get(key) for key in labels]) # print(mask) # print(pos) # print(word_segment_ids) # print(tokens) # tokens = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token]) tokens = tokenizer.convert_tokens_to_ids(tokens) # print(tokens) # exit() assert len(tokens) == len(new_labels), AssertionError( "The length of token and label is not matching") dataset.append( [tokens, new_labels, mask, pos, vm, tag, word_segment_ids]) # Enable dry rune if args.dry_run: count += 1 if count == 100: break return dataset # Evaluation function. def evaluate(args, is_test, final=False): if is_test: dataset = read_dataset(args.test_path) else: dataset = read_dataset(args.dev_path) input_ids = torch.LongTensor([sample[0] for sample in dataset]) label_ids = torch.LongTensor([sample[1] for sample in dataset]) mask_ids = torch.LongTensor([sample[2] for sample in dataset]) pos_ids = torch.LongTensor([sample[3] for sample in dataset]) vm_ids = torch.BoolTensor([sample[4] for sample in dataset]) tag_ids = torch.LongTensor([sample[5] for sample in dataset]) segment_ids = torch.LongTensor([sample[6] for sample in dataset]) instances_num = input_ids.size(0) batch_size = args.batch_size if is_test: print("Batch size: ", batch_size) print("The number of test instances:", instances_num) correct = 0 correct_with_type = 0 gold_entities_num = 0 pred_entities_num = 0 confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long) model.eval() for i, (input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids, pos_ids, vm_ids, tag_ids, segment_ids)): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) pos_ids_batch = pos_ids_batch.to(device) tag_ids_batch = tag_ids_batch.to(device) vm_ids_batch = vm_ids_batch.long().to(device) segment_ids_batch = segment_ids_batch.long().to(device) loss, _, pred, gold, _ = model(input_ids_batch, segment_ids_batch, mask_ids_batch, label_ids_batch, pos_ids_batch, vm_ids_batch, use_kg=args.use_kg) if final: with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \ open(f'{args.output_file_prefix}_gold.txt', 'a') as g, \ open(f'{args.output_file_prefix}_text.txt', 'a') as t: predicted_labels = [ idx_to_label.get(key) for key in pred.tolist() ] gold_labels = [ idx_to_label.get(key) for key in gold.tolist() ] num_tokens = len(predicted_labels) mask_ids_batch = mask_ids_batch.view(-1, num_tokens) masks = mask_ids_batch.tolist()[0] input_ids_batch = input_ids_batch.view(-1, num_tokens) tokens = input_ids_batch.tolist()[0] for start_idx in range(0, num_tokens, args.seq_length): pred_sample = predicted_labels[start_idx:start_idx + args.seq_length] gold_sample = gold_labels[start_idx:start_idx + args.seq_length] mask = masks[start_idx:start_idx + args.seq_length] num_labels = sum(mask) token_sample = tokens[start_idx:start_idx + args.seq_length] token_sample = token_sample[:num_labels] text = ''.join( tokenizer.convert_ids_to_tokens(token_sample)) text = bytearray([byte_decoder[c] for c in text]).decode('utf-8') p.write(' '.join(pred_sample[:num_labels]) + '\n') g.write(' '.join(gold_sample[:num_labels]) + '\n') t.write(text + '\n') for j in range(gold.size()[0]): if gold[j].item() in begin_ids: gold_entities_num += 1 for j in range(pred.size()[0]): if pred[j].item( ) in begin_ids and gold[j].item() != labels_map["[PAD]"]: pred_entities_num += 1 pred_entities_pos = [] pred_entities_pos_with_type = [] gold_entities_pos = [] gold_entities_pos_with_type = [] start, end = 0, 0 for j in range(gold.size()[0]): if gold[j].item() in begin_ids: start = j for k in range(j + 1, gold.size()[0]): if gold[k].item() == labels_map['[X]'] or gold[k].item( ) == labels_map['[ENT]']: continue if gold[k].item( ) == labels_map["[PAD]"] or gold[k].item( ) == labels_map["O"] or gold[k].item() in begin_ids: end = k - 1 break else: end = gold.size()[0] - 1 if args.eval_range_with_types: ent_type_gold = idx_to_label.get(gold[start].item()) ent_type_gold = ent_type_gold.replace('_NOKG', '') gold_entities_pos_with_type.append( (start, end, ent_type_gold)) gold_entities_pos.append((start, end)) for j in range(pred.size()[0]): if pred[j].item() in begin_ids and gold[j].item() != labels_map["[PAD]"] and gold[j].item() != \ labels_map["[ENT]"] and gold[j].item() != labels_map["[X]"]: start = j for k in range(j + 1, pred.size()[0]): if pred[k].item() == labels_map['[X]'] or gold[k].item( ) == labels_map['[ENT]']: continue if pred[k].item( ) == labels_map["[PAD]"] or pred[k].item( ) == labels_map["O"] or pred[k].item() in begin_ids: end = k - 1 break else: end = pred.size()[0] - 1 if args.eval_range_with_types: # Get all the labels in the range if start == end: entity_types = [ idx_to_label.get(l.item()) for l in [pred[start]] ] else: entity_types = [ idx_to_label.get(l.item()) for l in pred[start:end] ] # Run voting choicer final_entity_type = voting_choicer(entity_types) final_entity_type = final_entity_type.replace( '_NOKG', '') if final: logger.info( f'Predicted: {" ".join(entity_types)}, Selected: {final_entity_type}' ) if args.voting_choicer: # Convert back to label id and add in the tuple pred_entities_pos_with_type.append( (start, end, final_entity_type)) else: # Use the first prediction ent_type_pred = idx_to_label.get( pred[start].item()) ent_type_pred = ent_type_pred.replace('_NOKG', '') pred_entities_pos_with_type.append( (start, end, ent_type_pred)) pred_entities_pos.append((start, end)) for entity in pred_entities_pos: if entity not in gold_entities_pos: continue else: correct += 1 if args.eval_range_with_types: for entity in pred_entities_pos_with_type: if entity not in gold_entities_pos_with_type: continue else: correct_with_type += 1 try: print("Report precision, recall, and f1:") p = correct / pred_entities_num r = correct / gold_entities_num f1 = 2 * p * r / (p + r) print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1)) if args.eval_range_with_types: try: print( "Report accuracy with type, precision, recall, and f1:" ) p_with_type = correct_with_type / pred_entities_num r_with_type = correct_with_type / gold_entities_num f1_with_type = 2 * p_with_type * r_with_type / ( p_with_type + r_with_type) print("{:.3f}, {:.3f}, {:.3f}".format( p_with_type, r_with_type, f1_with_type)) except: pass return f1 except ZeroDivisionError: return 0 # Training phase. print("Start training.") instances = read_dataset(args.train_path) input_ids = torch.LongTensor([ins[0] for ins in instances]) label_ids = torch.LongTensor([ins[1] for ins in instances]) mask_ids = torch.LongTensor([ins[2] for ins in instances]) pos_ids = torch.LongTensor([ins[3] for ins in instances]) vm_ids = torch.BoolTensor([ins[4] for ins in instances]) tag_ids = torch.LongTensor([ins[5] for ins in instances]) segment_ids = torch.LongTensor([ins[6] for ins in instances]) instances_num = input_ids.size(0) batch_size = args.batch_size train_steps = int(instances_num * args.epochs_num / batch_size) + 1 train_batcher = Batcher(batch_size, input_ids, label_ids, mask_ids, pos_ids, vm_ids, tag_ids, segment_ids) print("Batch size: ", batch_size) print("The number of training instances:", instances_num) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup, t_total=train_steps) total_loss = 0. f1 = 0.0 best_f1 = 0.0 # Dry evaluate # evaluate(args, True) for epoch in range(1, args.epochs_num + 1): model.train() for i, (input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch) in enumerate(train_batcher): model.zero_grad() input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) pos_ids_batch = pos_ids_batch.to(device) tag_ids_batch = tag_ids_batch.to(device) vm_ids_batch = vm_ids_batch.long().to(device) segment_ids_batch = segment_ids_batch.long().to(device) loss, _, _, _, _ = model(input_ids_batch, segment_ids_batch, mask_ids_batch, label_ids_batch, pos_ids_batch, vm_ids_batch, use_kg=args.use_kg) if torch.cuda.device_count() > 1: loss = torch.mean(loss) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. loss.backward() optimizer.step() # Evaluation phase. print("Start evaluate on dev dataset.") f1 = evaluate(args, False) print("Start evaluation on test dataset.") evaluate(args, True) if f1 > best_f1: best_f1 = f1 save_model(model, args.output_model_path) save_encoder(args, encoder, suffix=args.suffix_file_encoder) else: continue # Evaluation phase. print("Final evaluation on test dataset.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, True, final=True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--label2id_path", type=str, required=True, help="Path of the label2id file.") parser.add_argument( "--crf_target", action="store_true", help="Use CRF loss as the target function or not, default False.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) with open(args.label2id_path, mode="r", encoding="utf-8") as f: l2i = json.load(f) print("Labels: ", l2i) l2i["[PAD]"] = len(l2i) i2l = {} for key, value in l2i.items(): i2l[value] = key args.l2i = l2i args.labels_num = len(l2i) # Load tokenizer. args.tokenizer = SpaceTokenizer(args) # Build sequence labeling model. model = NerTagger(args) model = load_model(model, args.load_model_path) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) instances = read_dataset(args, args.test_path) src = torch.LongTensor([ins[0] for ins in instances]) seg = torch.LongTensor([ins[1] for ins in instances]) instances_num = src.size(0) batch_size = args.batch_size print("The number of prediction instances: ", instances_num) model.eval() with open(args.prediction_path, mode="w", encoding="utf-8") as f: f.write("pred_label" + "\n") for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, pred = model(src_batch, None, seg_batch) # Storing sequence length of instances in a batch. seq_length_batch = [] for seg in seg_batch.cpu().numpy().tolist(): for j in range(len(seg) - 1, -1, -1): if seg[j] != 0: break seq_length_batch.append(j + 1) pred = pred.cpu().numpy().tolist() for j in range(0, len(pred), args.seq_length): for label_id in pred[j:j + seq_length_batch[j // args.seq_length]]: f.write(i2l[label_id] + " ") f.write("\n")
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--dataset_path_list", default=[], nargs='+', type=str, help="Dataset path list.") parser.add_argument("--output_model_path", default="./models/multitask_classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=32, help="Batch size.") parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.") parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Tokenizer options. parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") # Optimizer options. parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") parser.add_argument( "--fp16", action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit." ) parser.add_argument( "--fp16_opt_level", choices=["O0", "O1", "O2", "O3"], default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num_list = [ count_labels_num(os.path.join(path, "train.tsv")) for path in args.dataset_path_list ] args.datasets_num = len(args.dataset_path_list) # Build tokenizer. args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Build multi-task classification model. model = MultitaskClassifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) args.model = model # Training phase. dataset_list = [ read_dataset(args, os.path.join(path, "train.tsv")) for path in args.dataset_path_list ] packed_dataset_list = [ pack_dataset(dataset, i, args.batch_size) for i, dataset in enumerate(dataset_list) ] packed_dataset_all = [] for packed_dataset in packed_dataset_list: packed_dataset_all += packed_dataset random.shuffle(packed_dataset_all) instances_num = sum([len(dataset) for dataset in dataset_list]) batch_size = args.batch_size args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) total_loss, result, best_result = 0., 0., 0. print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (dataset_id, src_batch, tgt_batch, seg_batch) in enumerate(packed_dataset_all): if hasattr(model, "module"): model.module.change_dataset(dataset_id) else: model.change_dataset(dataset_id) loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, None) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. for dataset_id, path in enumerate(args.dataset_path_list): args.labels_num = args.labels_num_list[dataset_id] if hasattr(model, "module"): model.module.change_dataset(dataset_id) else: model.change_dataset(dataset_id) result = evaluate( args, read_dataset(args, os.path.join(path, "dev.tsv"))) save_model(model, args.output_model_path)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--dataset_path", type=str, default="dataset.pt", help="Path of the preprocessed dataset.") parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.") parser.add_argument("--pretrained_model_path", type=str, default=None, help="Path of the pretrained model.") parser.add_argument("--output_model_path", type=str, required=True, help="Path of the output model.") parser.add_argument("--config_path", type=str, default=None, help="Config file of model hyper-parameters.") # Training and saving options. parser.add_argument("--total_steps", type=int, default=100000, help="Total training steps.") parser.add_argument("--save_checkpoint_steps", type=int, default=10000, help="Specific steps to save model checkpoint.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--accumulation_steps", type=int, default=1, help="Specific steps to accumulate gradient.") parser.add_argument( "--batch_size", type=int, default=32, help= "Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps]." ) parser.add_argument("--instances_buffer_size", type=int, default=25600, help="The buffer size of instances in memory.") # Model options. parser.add_argument("--emb_size", type=int, default=768, help="Embedding dimension.") parser.add_argument("--hidden_size", type=int, default=768, help="Hidden state dimension.") parser.add_argument("--feedforward_size", type=int, default=3072, help="Feed forward layer dimension.") parser.add_argument("--kernel_size", type=int, default=3, help="Kernel size for CNN.") parser.add_argument("--block_size", type=int, default=2, help="Block size for CNN.") parser.add_argument("--heads_num", type=int, default=12, help="The number of heads in multi-head attention.") parser.add_argument("--layers_num", type=int, default=12, help="The number of encoder layers.") parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--target", choices=["bert", "lm", "cls", "mlm", "bilm"], default="bert", help="The training target of the pretraining model.") parser.add_argument("--labels_num", type=int, default=2, help="Specific to classification target.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Initial learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # GPU options. parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.") parser.add_argument( "--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process." " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU." ) parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.") parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.") args = parser.parse_args() # Load hyper-parameters from config file. if args.config_path: load_hyperparam(args) ranks_num = len(args.gpu_ranks) if args.world_size > 1: # Multiprocessing distributed mode. assert torch.cuda.is_available(), "No available GPUs." assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit." assert ranks_num <= torch.cuda.device_count( ), "Started processes exceeds the available GPUs." args.dist_train = True args.ranks_num = ranks_num print("Using distributed mode for training.") elif args.world_size == 1 and ranks_num == 1: # Single GPU mode. assert torch.cuda.is_available(), "No available GPUs." args.gpu_id = args.gpu_ranks[0] assert args.gpu_id < torch.cuda.device_count( ), "Invalid specified GPU device." args.dist_train = False args.single_gpu = True print("Using single GPU:%d for training." % args.gpu_id) else: # CPU mode. assert ranks_num == 0, "GPUs are specified, please check the arguments." args.dist_train = False args.single_gpu = False print("Using CPU mode for training.") trainer.train_and_validate(args)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/QA_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--config_path", default="./models/google_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=100, help="Sequence length.") parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Tokenizer options. parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="char", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Load vocabulary. vocab = Vocab() vocab.load(args.vocab_path) args.vocab = vocab args.target = "bert" bert_model = build_model(args) # Load or initialize parameters. if args.pretrained_model_path is not None: # Initialize with pretrained model. bert_model.load_state_dict(torch.load(args.pretrained_model_path), strict=False) else: # Initialize with normal distribution. for n, p in list(bert_model.named_parameters()): if 'gamma' not in n and 'beta' not in n: p.data.normal_(0, 0.02) # Build QA model. model = BertQuestionAnswering(args, bert_model) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) # Dataset loader. def batch_loader(batch_size, input_ids, mask_ids, start_positions, end_positions): instances_num = input_ids.size()[0] for i in range(instances_num // batch_size): input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :] mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :] start_positions_batch = start_positions[i * batch_size:(i + 1) * batch_size] end_positions_batch = end_positions[i * batch_size:(i + 1) * batch_size] yield input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch if instances_num > instances_num // batch_size * batch_size: input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :] mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :] start_positions_batch = start_positions[instances_num // batch_size * batch_size:] end_positions_batch = end_positions[instances_num // batch_size * batch_size:] yield input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch # Build tokenizer. tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Read examples. def read_examples(path): examples = [] with open(path, 'r', encoding='utf-8') as fp: all_dict = json.loads(fp.read()) v1 = all_dict["data"] for i in range(len(v1)): data_dict = v1[i] v2 = data_dict["paragraphs"] for j in range(len(v2)): para_dict = v2[j] context = para_dict["context"] v3 = para_dict["qas"] for m in range(len(v3)): qas_dict = v3[m] question = qas_dict["question"] question_id = qas_dict["id"] v4 = qas_dict["answers"] answers = [] start_positions = [] end_positions = [] for n in range(len(v4)): ans_dict = v4[n] answer = ans_dict["text"] start_position = ans_dict["answer_start"] end_position = start_position + len(answer) answers.append(answer) start_positions.append(start_position) end_positions.append(end_position) examples.append( (context, question, question_id, start_positions, end_positions, answers)) return examples def convert_examples_to_dataset(examples, args): dataset = [] print("The number of questions in the dataset", len(examples)) for i in range(len(examples)): context = examples[i][0] question = examples[i][1] q_len = len(question) question_id = examples[i][2] start_positions_true = examples[i][3][0] #待修改 end_positions_true = examples[i][4][0] answers = examples[i][5] max_context_length = args.seq_length - q_len - 3 # divide the context to some spans _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(context): length = len(context) - start_offset if length > max_context_length: length = max_context_length doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(context): break start_offset += min(length, args.doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): doc_span_start = doc_span.start span_context = context[doc_span_start:doc_span_start + doc_span.length] # convert the start or end position to real position in tokens start_positions = start_positions_true - doc_span_start + q_len + 2 end_positions = end_positions_true - doc_span_start + q_len + 2 # the answers of some question are not in the doc_span, we ignore them. if start_positions < q_len + 2 or start_positions > doc_span.length + q_len + 2 or end_positions < q_len + 2 or end_positions > doc_span.length + q_len + 2: continue tokens_a = [vocab.get(t) for t in tokenizer.tokenize(question)] tokens_a = [CLS_ID] + tokens_a + [SEP_ID] tokens_b = [ vocab.get(t) for t in tokenizer.tokenize(span_context) ] tokens_b = tokens_b + [SEP_ID] tokens = tokens_a + tokens_b mask = [1] * len(tokens_a) + [2] * len(tokens_b) while len(tokens) < args.seq_length: tokens.append(0) mask.append(0) dataset.append( (tokens, mask, start_positions, end_positions, answers, question_id, q_len, doc_span_index, doc_span_start)) return dataset # Evaluation function. def evaluate(args, is_test): # some calculation functions def mixed_segmentation(in_str, rm_punc=False): in_str = str(in_str).lower().strip() segs_out = [] temp_str = "" sp_char = [ '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(', ')', '-', '~', '『', '』' ] for char in in_str: if rm_punc and char in sp_char: continue if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char: if temp_str != "": ss = nltk.word_tokenize(temp_str) segs_out.extend(ss) temp_str = "" segs_out.append(char) else: temp_str += char #handling last part if temp_str != "": ss = nltk.word_tokenize(temp_str) segs_out.extend(ss) return segs_out # remove punctuation def remove_punctuation(in_str): in_str = str(in_str).lower().strip() sp_char = [ '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(', ')', '-', '~', '『', '』' ] out_segs = [] for char in in_str: if char in sp_char: continue else: out_segs.append(char) return ''.join(out_segs) # find longest common string def find_lcs(s1, s2): m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)] mmax = 0 p = 0 for i in range(len(s1)): for j in range(len(s2)): if s1[i] == s2[j]: m[i + 1][j + 1] = m[i][j] + 1 if m[i + 1][j + 1] > mmax: mmax = m[i + 1][j + 1] p = i + 1 return s1[p - mmax:p], mmax def calc_f1_score(answers, prediction): f1_scores = [] for i in range(len(answers)): ans = answers[i] ans_segs = mixed_segmentation(ans, rm_punc=True) prediction_segs = mixed_segmentation(prediction, rm_punc=True) lcs, lcs_len = find_lcs(ans_segs, prediction_segs) if lcs_len == 0: f1_scores.append(0) else: precision = 1.0 * lcs_len / len(prediction_segs) recall = 1.0 * lcs_len / len(ans_segs) f1 = (2 * precision * recall) / (precision + recall) f1_scores.append(f1) return max(f1_scores) def calc_em_score(answers, prediction): em = 0 for i in range(len(answers)): ans = answers[i] ans_ = remove_punctuation(ans) prediction_ = remove_punctuation(prediction) if ans_ == prediction_: em = 1 break return em def is_max_score(score_list): score_max = -100 index_max = 0 best_start_prediction = 0 best_end_prediction = 0 for i in range(len(score_list)): if score_max <= score_list[i][3]: score_max = score_list[i][3] index_max = score_list[i][0] best_start_prediction = score_list[i][1] best_end_prediction = score_list[i][2] return index_max, best_start_prediction, best_end_prediction if is_test: examples = read_examples(args.test_path) dataset = convert_examples_to_dataset(examples, args) else: examples = read_examples(args.dev_path) dataset = convert_examples_to_dataset(examples, args) input_ids = torch.LongTensor([sample[0] for sample in dataset]) mask_ids = torch.LongTensor([sample[1] for sample in dataset]) start_positions = torch.LongTensor([sample[2] for sample in dataset]) end_positions = torch.LongTensor([sample[3] for sample in dataset]) batch_size = args.batch_size instances_num = input_ids.size()[0] if is_test: print("The number of evaluation instances: ", instances_num) model.eval() start_logits_all = [] end_logits_all = [] start_pred_all = [] end_pred_all = [] for i, (input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch) in enumerate( batch_loader(batch_size, input_ids, mask_ids, start_positions, end_positions)): model.zero_grad() input_ids_batch = input_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) start_positions_batch = start_positions_batch.to(device) end_positions_batch = end_positions_batch.to(device) with torch.no_grad(): loss, start_logits, end_logits = model(input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch) start_logits = nn.Softmax(dim=1)(start_logits) end_logits = nn.Softmax(dim=1)(end_logits) start_pred = torch.argmax(start_logits, dim=1) end_pred = torch.argmax(end_logits, dim=1) start_pred = start_pred.cpu().numpy().tolist() end_pred = end_pred.cpu().numpy().tolist() start_logits = start_logits.cpu().numpy().tolist() end_logits = end_logits.cpu().numpy().tolist() start_logits_max = [] end_logits_max = [] for j in range(len(start_pred)): start_logits_max.append(start_logits[j][start_pred[j]]) end_logits_max.append(end_logits[j][end_pred[j]]) start_logits_all += start_logits_max end_logits_all += end_logits_max start_pred_all += start_pred end_pred_all += end_pred assert len(start_pred_all) == len(dataset) assert len(start_logits_all) == len(dataset) # couster by question id and chose the best answer in doc_spans order = -1 pred_list = [] templist = [] for i in range(len(dataset)): qid = dataset[i][5] q_len = dataset[i][6] span_index = dataset[i][7] doc_span_start = dataset[i][8] score1 = float(start_logits_all[i]) score2 = float(end_logits_all[i]) score = (score1 + score2) / 2 pre_start_pred = start_pred_all[i] + doc_span_start - q_len - 2 pre_end_pred = end_pred_all[i] + doc_span_start - q_len - 2 if qid == order: templist.append( (span_index, pre_start_pred, pre_end_pred, score)) else: order = qid if i > 0: span_index_max, best_start_prediction, best_end_prediction = is_max_score( templist) pred_list.append((span_index_max, best_start_prediction, best_end_prediction)) templist = [] templist.append( (span_index, pre_start_pred, pre_end_pred, score)) span_index_max, best_start_prediction, best_end_prediction = is_max_score( templist) pred_list.append( (span_index_max, best_start_prediction, best_end_prediction)) assert len(pred_list) == len(examples) #strat pred f1 = 0 em = 0 total_count = len(examples) skip_count = 0 for i in range(len(examples)): question_id = examples[i][2] answers = examples[i][5] span_index = pred_list[i][0] start_prediction = pred_list[i][1] end_prediction = pred_list[i][2] #error prediction if end_prediction <= start_prediction: skip_count += 1 continue prediction = examples[i][0][start_prediction:end_prediction] f1 += calc_f1_score(answers, prediction) em += calc_em_score(answers, prediction) f1_score = 100.0 * f1 / total_count em_score = 100.0 * em / total_count avg = (f1_score + em_score) * 0.5 print("Avg: {:.4f},F1:{:.4f},EM:{:.4f},Total:{},Skip:{}".format( avg, f1_score, em_score, total_count, skip_count)) return avg # Training phase print("Start training.") batch_size = args.batch_size print("Batch size: ", batch_size) examples = read_examples(args.train_path) trainset = convert_examples_to_dataset(examples, args) random.shuffle(trainset) instances_num = len(trainset) input_ids = torch.LongTensor([sample[0] for sample in trainset]) mask_ids = torch.LongTensor([sample[1] for sample in trainset]) start_positions = torch.LongTensor([sample[2] for sample in trainset]) end_positions = torch.LongTensor([sample[3] for sample in trainset]) train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("The number of training instances:", instances_num) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup, t_total=train_steps) total_loss = 0. result = 0.0 best_result = 0.0 for epoch in range(1, args.epochs_num + 1): model.train() for i, (input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch) in enumerate( batch_loader(batch_size, input_ids, mask_ids, start_positions, end_positions)): model.zero_grad() input_ids_batch = input_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) start_positions_batch = start_positions_batch.to(device) end_positions_batch = end_positions_batch.to(device) loss, _, _ = model(input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch) if torch.cuda.device_count() > 1: loss = torch.mean(loss) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. loss.backward() optimizer.step() result = evaluate(args, False) if result > best_result: best_result = result save_model(model, args.output_model_path) else: continue # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") model = load_model(model, args.output_model_path) evaluate(args, True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build model and load parameters. model = MachineReadingComprehension(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset, examples = read_dataset(args, args.test_path) src = torch.LongTensor([sample[0] for sample in dataset]) seg = torch.LongTensor([sample[1] for sample in dataset]) start_position = torch.LongTensor([sample[2] for sample in dataset]) end_position = torch.LongTensor([sample[3] for sample in dataset]) batch_size = args.batch_size instances_num = len(dataset) print("The number of prediction instances: ", instances_num) model.eval() with open(args.prediction_path, mode="w", encoding="utf-8") as f: start_prob_all, end_prob_all = [], [] for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate( batch_loader(batch_size, src, seg, start_position, end_position)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) start_position_batch = start_position_batch.to(device) end_position_batch = end_position_batch.to(device) with torch.no_grad(): loss, start_logits, end_logits = model(src_batch, seg_batch, start_position_batch, end_position_batch) start_prob = nn.Softmax(dim=1)(start_logits) end_prob = nn.Softmax(dim=1)(end_logits) for j in range(start_prob.size()[0]): start_prob_all.append(start_prob[j]) end_prob_all.append(end_prob[j]) pred_answers = get_answers(dataset, start_prob_all, end_prob_all) output = {} for i in range(len(examples)): question_id = examples[i][2] start_pred_pos = pred_answers[i][1] end_pred_pos = pred_answers[i][2] prediction = examples[i][0][start_pred_pos:end_pred_pos + 1] output[question_id] = prediction f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/cmrc_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=100, help="Sequence length.") parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") parser.add_argument( "--fp16", action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( "--fp16_opt_level", choices=["O0", "O1", "O2", "O3"], default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build machine reading comprehension model. model = MachineReadingComprehension(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Training phase. batch_size = args.batch_size print("Batch size: ", batch_size) trainset, _ = read_dataset(args, args.train_path) random.shuffle(trainset) instances_num = len(trainset) src = torch.LongTensor([sample[0] for sample in trainset]) seg = torch.LongTensor([sample[1] for sample in trainset]) start_position = torch.LongTensor([sample[2] for sample in trainset]) end_position = torch.LongTensor([sample[3] for sample in trainset]) args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss = 0. result = 0.0 best_result = 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate( batch_loader(batch_size, src, seg, start_position, end_position)): loss = train(args, model, optimizer, scheduler, src_batch, seg_batch, start_position_batch, end_position_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. result = evaluate(args, *read_dataset(args, args.dev_path)) if result > best_result: best_result = result save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, *read_dataset(args, args.test_path))
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.") parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space." ) parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.") parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Build classification model and load parameters. args.soft_targets, args.soft_alpha = False, False model = Classifier(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset = read_dataset(args, args.test_path) src = torch.LongTensor([sample[0] for sample in dataset]) seg = torch.LongTensor([sample[1] for sample in dataset]) batch_size = args.batch_size instances_num = src.size()[0] print("The number of prediction instances: ", instances_num) model.eval() with open(args.prediction_path, mode="w", encoding="utf-8") as f: f.write("label") if args.output_logits: f.write("\t" + "logits") if args.output_prob: f.write("\t" + "prob") f.write("\n") for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) pred = torch.argmax(logits, dim=1) pred = pred.cpu().numpy().tolist() prob = nn.Softmax(dim=1)(logits) logits = logits.cpu().numpy().tolist() prob = prob.cpu().numpy().tolist() for j in range(len(pred)): f.write(str(pred[j])) if args.output_logits: f.write("\t" + " ".join([str(v) for v in logits[j]])) if args.output_prob: f.write("\t" + " ".join([str(v) for v in prob[j]])) f.write("\n")
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") tokenizer_opts(parser) parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--soft_alpha", type=float, default=0.5, help="Weight of the soft targets loss.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model. model = Classifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. trainset = read_dataset(args, args.train_path) random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size src = torch.LongTensor([example[0] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg = torch.LongTensor([example[2] for example in trainset]) if args.soft_targets: soft_tgt = torch.FloatTensor([example[3] for example in trainset]) else: soft_tgt = None args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, result, best_result = 0.0, 0.0, 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate( batch_loader(batch_size, src, tgt, seg, soft_tgt)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 result = evaluate(args, read_dataset(args, args.dev_path)) if result[0] > best_result: best_result = result[0] save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: args.model.module.load_state_dict( torch.load(args.output_model_path)) else: args.model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, read_dataset(args, args.test_path), True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--load_model_path", default=None, type=str, help="Path of the classfier model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=512, help="Sequence length.") parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = CharTokenizer(args) # Build model and load parameters. model = MachineReadingComprehension(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset, examples = read_dataset(args, args.test_path) src = torch.LongTensor([sample[0] for sample in dataset]) seg = torch.LongTensor([sample[1] for sample in dataset]) start_position = torch.LongTensor([sample[2] for sample in dataset]) end_position = torch.LongTensor([sample[3] for sample in dataset]) batch_size = args.batch_size instances_num = len(dataset) print("The number of prediction instances: ", instances_num) model.eval() with open(args.prediction_path, mode="w", encoding="utf-8") as f: start_prob_all, end_prob_all = [], [] for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate( batch_loader(batch_size, src, seg, start_position, end_position)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) start_position_batch = start_position_batch.to(device) end_position_batch = end_position_batch.to(device) with torch.no_grad(): loss, start_logits, end_logits = model(src_batch, seg_batch, start_position_batch, end_position_batch) start_prob = nn.Softmax(dim=1)(start_logits) end_prob = nn.Softmax(dim=1)(end_logits) for j in range(start_prob.size()[0]): start_prob_all.append(start_prob[j]) end_prob_all.append(end_prob[j]) pred_answers = get_answers(dataset, start_prob_all, end_prob_all) output = {} for i in range(len(examples)): question_id = examples[i][2] start_pred_pos = pred_answers[i][1] end_pred_pos = pred_answers[i][2] prediction = examples[i][0][start_pred_pos:end_pred_pos] output[question_id] = prediction f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--dataset_path", type=str, default="dataset.pt", help="Path of the preprocessed dataset.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--tgt_vocab_path", default=None, type=str, help="Path of the target vocabulary file.") parser.add_argument("--tgt_spm_model_path", default=None, type=str, help="Path of the target sentence piece model.") parser.add_argument("--pretrained_model_path", type=str, default=None, help="Path of the pretrained model.") parser.add_argument("--output_model_path", type=str, required=True, help="Path of the output model.") parser.add_argument("--config_path", type=str, default="models/bert/base_config.json", help="Config file of model hyper-parameters.") # Training and saving options. parser.add_argument("--total_steps", type=int, default=100000, help="Total training steps.") parser.add_argument("--save_checkpoint_steps", type=int, default=10000, help="Specific steps to save model checkpoint.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--accumulation_steps", type=int, default=1, help="Specific steps to accumulate gradient.") parser.add_argument("--batch_size", type=int, default=32, help="Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps].") parser.add_argument("--instances_buffer_size", type=int, default=25600, help="The buffer size of instances in memory.") parser.add_argument("--labels_num", type=int, required=False, help="Number of prediction labels.") parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") # Preprocess options. parser.add_argument("--tokenizer", choices=["bert", "char", "space", "xlmroberta"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space." "Original XLM-RoBERTa uses xlmroberta tokenizer." ) parser.add_argument("--tgt_tokenizer", choices=["bert", "char", "space", "xlmroberta"], default="bert", help="Specify the tokenizer for target side.") # Model options. model_opts(parser) parser.add_argument("--tgt_embedding", choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"], default="word_pos_seg", help="Target embedding type.") parser.add_argument("--decoder", choices=["transformer"], default="transformer", help="Decoder type.") parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm", "gsg", "bart"], default="bert", help="The training target of the pretraining model.") parser.add_argument("--tie_weights", action="store_true", help="Tie the word embedding and softmax weights.") parser.add_argument("--has_lmtarget_bias", action="store_true", help="Add bias on output_layer for lm target.") parser.add_argument("--deep_init", action="store_true", help="initialize bert model similar to gpt2 model." "scales initialization of projection layers by a " "factor of 1/sqrt(2N). Necessary to train bert " "models larger than BERT-Large.") # Masking options. parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.") parser.add_argument("--span_masking", action="store_true", help="Span masking.") parser.add_argument("--span_geo_prob", type=float, default=0.2, help="Hyperparameter of geometric distribution for span masking.") parser.add_argument("--span_max_length", type=int, default=10, help="Max length for span masking.") # Optimizer options. optimization_opts(parser) # GPU options. parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.") parser.add_argument("--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process." " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU.") parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.") parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.") args = parser.parse_args() if args.target == "cls": assert args.labels_num is not None, "Cls target needs the denotation of the number of labels." # Load hyper-parameters from config file. if args.config_path: load_hyperparam(args) ranks_num = len(args.gpu_ranks) if args.world_size > 1: # Multiprocessing distributed mode. assert torch.cuda.is_available(), "No available GPUs." assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit." assert ranks_num <= torch.cuda.device_count(), "Started processes exceeds the available GPUs." args.dist_train = True args.ranks_num = ranks_num print("Using distributed mode for training.") elif args.world_size == 1 and ranks_num == 1: # Single GPU mode. assert torch.cuda.is_available(), "No available GPUs." args.gpu_id = args.gpu_ranks[0] assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device." args.dist_train = False args.single_gpu = True print("Using GPU %d for training." % args.gpu_id) else: # CPU mode. assert ranks_num == 0, "GPUs are specified, please check the arguments." args.dist_train = False args.single_gpu = False print("Using CPU mode for training.") trainer.train_and_validate(args)
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") parser.add_argument("--train_features_path", type=str, required=True, help="Path of the train features for stacking.") # Model options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.") parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", "synt", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.") parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.") # Tokenizer options. parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space." ) # Optimizer options. parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--soft_alpha", type=float, default=0.5, help="Weight of the soft targets loss.") parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") parser.add_argument("--fp16", action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.") parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") # Training options. parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") # Cross validation options. parser.add_argument("--folds_num", type=int, default=5, help="The number of folds for cross validation.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args) # Training phase. dataset = read_dataset(args, args.train_path) instances_num = len(dataset) batch_size = args.batch_size instances_num_per_fold = instances_num // args.folds_num + 1 args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 train_features = [] total_loss, result = 0., 0. acc, marco_f1 = 0., 0. for fold_id in range(args.folds_num): # Build classification model. model = Classifier(args) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) load_or_initialize_parameters(args, model) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) args.model = model trainset = dataset[0:fold_id*instances_num_per_fold] + dataset[(fold_id+1)*instances_num_per_fold:] random.shuffle(trainset) train_src = torch.LongTensor([example[0] for example in trainset]) train_tgt = torch.LongTensor([example[1] for example in trainset]) train_seg = torch.LongTensor([example[2] for example in trainset]) if args.soft_targets: train_soft_tgt = torch.FloatTensor([example[3] for example in trainset]) else: train_soft_tgt = None devset = dataset[fold_id*instances_num_per_fold:(fold_id+1)*instances_num_per_fold] dev_src = torch.LongTensor([example[0] for example in devset]) dev_tgt = torch.LongTensor([example[1] for example in devset]) dev_seg = torch.LongTensor([example[2] for example in devset]) dev_soft_tgt = None for epoch in range(1, args.epochs_num+1): model.train() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, train_src, train_tgt, train_seg, train_soft_tgt)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Fold id: {}, Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(fold_id, epoch, i+1, total_loss / args.report_steps)) total_loss = 0. model.eval() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, dev_src, dev_tgt, dev_seg, dev_soft_tgt)): src_batch = src_batch.to(args.device) seg_batch = seg_batch.to(args.device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) prob = nn.Softmax(dim=1)(logits) prob = prob.cpu().numpy().tolist() train_features.extend(prob) output_model_name = ".".join(args.output_model_path.split(".")[:-1]) output_model_suffix = args.output_model_path.split(".")[-1] save_model(model, output_model_name+"-fold_"+str(fold_id)+"."+output_model_suffix) result = evaluate(args, devset) acc += result[0]/args.folds_num f1 = [] confusion = result[1] for i in range(confusion.size()[0]): p = confusion[i,i].item()/confusion[i,:].sum().item() r = confusion[i,i].item()/confusion[:,i].sum().item() f1.append(2*p*r/(p+r)) marco_f1 += sum(f1)/len(f1)/args.folds_num # print("Acc. : {:.4f}".format(result[0])) # print("Marco F1 : {:.4f}".format(sum(f1)/len(f1))) train_features = np.array(train_features) np.save(args.train_features_path, train_features) print("Acc. : {:.4f}".format(acc)) print("Marco F1 : {:.4f}".format(marco_f1))
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) parser.add_argument( "--max_choices_num", default=4, type=int, help= "The maximum number of cadicate answer, shorter than this will be padded." ) parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") args = parser.parse_args() args.labels_num = args.max_choices_num # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build multiple choice model. model = MultipleChoice(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. trainset = read_dataset(args, args.train_path) random.shuffle(trainset) instances_num = len(trainset) batch_size = args.batch_size src = torch.LongTensor([example[0] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg = torch.LongTensor([example[2] for example in trainset]) args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, result, best_result = 0.0, 0.0, 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 result = evaluate(args, read_dataset(args, args.dev_path)) if result[0] > best_result: best_result = result[0] save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, read_dataset(args, args.test_path))
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--dataset_path_list", default=[], nargs='+', type=str, help="Dataset path list.") parser.add_argument("--output_model_path", default="models/multitask_classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--config_path", default="models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. model_opts(parser) parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") # Tokenizer options. parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") # Optimizer options. optimization_opts(parser) # Training options. training_opts(parser) args = parser.parse_args() args.soft_targets = False # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num_list = [ count_labels_num(os.path.join(path, "train.tsv")) for path in args.dataset_path_list ] args.datasets_num = len(args.dataset_path_list) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build multi-task classification model. model = MultitaskClassifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) args.model = model # Training phase. dataset_list = [ read_dataset(args, os.path.join(path, "train.tsv")) for path in args.dataset_path_list ] packed_dataset_list = [ pack_dataset(dataset, i, args.batch_size) for i, dataset in enumerate(dataset_list) ] packed_dataset_all = [] for packed_dataset in packed_dataset_list: packed_dataset_all += packed_dataset random.shuffle(packed_dataset_all) instances_num = sum([len(dataset) for dataset in dataset_list]) batch_size = args.batch_size args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) total_loss, result, best_result = 0.0, 0.0, 0.0 print("Start training.") for epoch in range(1, args.epochs_num + 1): model.train() for i, (dataset_id, src_batch, tgt_batch, seg_batch) in enumerate(packed_dataset_all): if hasattr(model, "module"): model.module.change_dataset(dataset_id) else: model.change_dataset(dataset_id) loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, None) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 for dataset_id, path in enumerate(args.dataset_path_list): args.labels_num = args.labels_num_list[dataset_id] if hasattr(model, "module"): model.module.change_dataset(dataset_id) else: model.change_dataset(dataset_id) result = evaluate( args, read_dataset(args, os.path.join(path, "dev.tsv"))) save_model(model, args.output_model_path)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="models/classifier_model.bin", type=str, help="Path of the output model.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--config_path", default="models/bert/base_config.json", type=str, help="Path of the config file.") parser.add_argument("--train_features_path", type=str, required=True, help="Path of the train features for stacking.") # Model options. model_opts(parser) parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") # Tokenizer options. tokenizer_opts(parser) # Optimization options. optimization_opts(parser) parser.add_argument("--soft_targets", action='store_true', help="Train model with logits.") parser.add_argument("--soft_alpha", type=float, default=0.5, help="Weight of the soft targets loss.") # Training options. training_opts(parser) # Cross validation options. parser.add_argument("--folds_num", type=int, default=5, help="The number of folds for cross validation.") adv_opts(parser) args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Get logger. args.logger = get_logger(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Training phase. dataset = read_dataset(args, args.train_path) instances_num = len(dataset) batch_size = args.batch_size instances_num_per_fold = instances_num // args.folds_num + 1 args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 train_features = [] total_loss, result = 0.0, 0.0 acc, marco_f1 = 0.0, 0.0 for fold_id in range(args.folds_num): # Build classification model. model = Classifier(args) args.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) load_or_initialize_parameters(args, model) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) args.model = model if args.use_adv: args.adv_method = str2adv[args.adv_type](model) trainset = dataset[0:fold_id * instances_num_per_fold] + dataset[ (fold_id + 1) * instances_num_per_fold:] devset = dataset[fold_id * instances_num_per_fold:(fold_id + 1) * instances_num_per_fold] dev_src = torch.LongTensor([example[0] for example in devset]) dev_tgt = torch.LongTensor([example[1] for example in devset]) dev_seg = torch.LongTensor([example[2] for example in devset]) dev_soft_tgt = None for epoch in range(1, args.epochs_num + 1): random.shuffle(trainset) train_src = torch.LongTensor([example[0] for example in trainset]) train_tgt = torch.LongTensor([example[1] for example in trainset]) train_seg = torch.LongTensor([example[2] for example in trainset]) if args.soft_targets: train_soft_tgt = torch.FloatTensor( [example[3] for example in trainset]) else: train_soft_tgt = None model.train() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate( batch_loader(batch_size, train_src, train_tgt, train_seg, train_soft_tgt)): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: args.logger.info( "Fold id: {}, Epoch id: {}, Training steps: {}, Avg loss: {:.3f}" .format(fold_id, epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 model.eval() for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate( batch_loader(batch_size, dev_src, dev_tgt, dev_seg, dev_soft_tgt)): src_batch = src_batch.to(args.device) seg_batch = seg_batch.to(args.device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) prob = nn.Softmax(dim=1)(logits) prob = prob.cpu().numpy().tolist() train_features.extend(prob) output_model_name = ".".join(args.output_model_path.split(".")[:-1]) output_model_suffix = args.output_model_path.split(".")[-1] save_model( model, output_model_name + "-fold_" + str(fold_id) + "." + output_model_suffix) result = evaluate(args, devset) acc += result[0] / args.folds_num f1 = [] confusion = result[1] eps = 1e-9 for i in range(confusion.size()[0]): p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps) r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps) f1.append(2 * p * r / (p + r + eps)) marco_f1 += sum(f1) / len(f1) / args.folds_num train_features = np.array(train_features) np.save(args.train_features_path, train_features) args.logger.info("Acc. : {:.4f}".format(acc)) args.logger.info("Marco F1 : {:.4f}".format(marco_f1))
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) finetune_opts(parser) tokenizer_opts(parser) args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) set_seed(args.seed) # Count the number of labels. args.labels_num = count_labels_num(args.train_path) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model. model = SiameseClassifier(args) # Load or initialize parameters. load_or_initialize_parameters(args, model) # Get logger. args.logger = init_logger(args) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(args.device) # Training phase. trainset = read_dataset(args, args.train_path) instances_num = len(trainset) batch_size = args.batch_size args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1 args.logger.info("Batch size: {}".format(batch_size)) args.logger.info( "The number of training instances: {}".format(instances_num)) optimizer, scheduler = build_optimizer(args, model) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) args.amp = amp if torch.cuda.device_count() > 1: args.logger.info("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) args.model = model total_loss, result, best_result = 0.0, 0.0, 0.0 args.logger.info("Start training.") for epoch in range(1, args.epochs_num + 1): random.shuffle(trainset) src_a = torch.LongTensor([example[0][0] for example in trainset]) src_b = torch.LongTensor([example[0][1] for example in trainset]) tgt = torch.LongTensor([example[1] for example in trainset]) seg_a = torch.LongTensor([example[2][0] for example in trainset]) seg_b = torch.LongTensor([example[2][1] for example in trainset]) model.train() for i, (src_batch, tgt_batch, seg_batch) in enumerate( batch_loader(batch_size, (src_a, src_b), tgt, (seg_a, seg_b))): loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch) total_loss += loss.item() if (i + 1) % args.report_steps == 0: args.logger.info( "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0.0 result = evaluate(args, read_dataset(args, args.dev_path)) if result[0] > best_result: best_result = result[0] save_model(model, args.output_model_path) # Evaluation phase. if args.test_path is not None: args.logger.info("Test set evaluation.") if torch.cuda.device_count() > 1: args.model.module.load_state_dict( torch.load(args.output_model_path)) else: args.model.load_state_dict(torch.load(args.output_model_path)) evaluate(args, read_dataset(args, args.test_path))
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) infer_opts(parser) parser.add_argument( "--max_choices_num", default=4, type=int, help= "The maximum number of cadicate answer, shorter than this will be padded." ) parser.add_argument( "--tokenizer", choices=["bert", "char", "space"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer on Chinese corpus." "Char tokenizer segments sentences into characters." "Space tokenizer segments sentences into words according to space.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model and load parameters. model = MultipleChoice(args) model = load_model(model, args.load_model_path) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) dataset = read_dataset(args, args.test_path) src = torch.LongTensor([example[0] for example in dataset]) tgt = torch.LongTensor([example[1] for example in dataset]) seg = torch.LongTensor([example[2] for example in dataset]) batch_size = args.batch_size instances_num = src.size()[0] print("The number of prediction instances: ", instances_num) model.eval() with open(args.test_path) as f: data = json.load(f) question_ids = [] for i in range(len(data)): questions = data[i][1] for question in questions: question_ids.append(question["id"]) index = 0 with open(args.prediction_path, "w") as f: for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) pred = torch.argmax(logits, dim=1) pred = pred.cpu().numpy().tolist() for j in range(len(pred)): output = {} output["id"] = question_ids[index] index += 1 output["label"] = int(pred[j]) f.write(json.dumps(output)) f.write("\n")
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, required=True, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/tagger_model.bin", type=str, help="Path of the output model.") parser.add_argument("--output_encoder", default="./luke-models/", type=str, help="Path of the output luke model.") parser.add_argument("--suffix_file_encoder", default="encoder", type=str, help="output file suffix luke model.") parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, required=True, help="Path of the testset.") parser.add_argument("--config_path", default="./models/google_config.json", type=str, help="Path of the config file.") parser.add_argument("--output_file_prefix", type=str, required=True, help="Prefix for file output.") parser.add_argument("--log_file", default='app.log') # Model options. parser.add_argument("--seq_length", default=256, type=int, help="Sequence length.") parser.add_argument("--classifier", choices=["mlp", "lstm", "lstm_crf", "lstm_ncrf"], default="mlp", help="Classifier type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") parser.add_argument('--freeze_encoder_weights', action='store_true', help="Enable to freeze the encoder weigths.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Training options. parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=0, help="Number of epochs.") parser.add_argument("--gradient_accumulation_steps", type=int, default=2, help="Number of steps to accumulate the gradient.") parser.add_argument("--report_steps", type=int, default=200, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=35, help="Random seed.") parser.add_argument("--batch_size", type=int, default=32, help="Batch_size.") parser.add_argument("--num_train_steps", type=int, default=20000, help="Max steps to be trained.") parser.add_argument("--patience", type=int, default=8000, help="Specific steps to wait until stops training.") # Optimizer options. parser.add_argument("--learning_rate", default=1e-5, type=float) parser.add_argument("--lr_schedule", default="warmup_linear", type=str, choices=["warmup_linear", "warmup_constant"]) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--max_grad_norm", default=0.0, type=float) parser.add_argument("--adam_b1", default=0.9, type=float) parser.add_argument("--adam_b2", default=0.999, type=float) parser.add_argument("--adam_eps", default=1e-8, type=float) parser.add_argument("--adam_correct_bias", action='store_true') parser.add_argument("--warmup_proportion", default=0.006, type=float) parser.add_argument("--freeze_proportions", default=0.0, type=float) parser.add_argument("--wandb", action='store_true', help="Enable wandb logging") # kg parser.add_argument("--kg_name", type=str, help="KG name or path") parser.add_argument("--use_kg", action='store_true', help="Enable the use of KG.") parser.add_argument("--padding", action='store_true', help="Enable padding.") parser.add_argument( "--truncate", action='store_true', help="Enable truncation if length is more than seq length.") parser.add_argument("--shuffle", action='store_true', help="Enable shuffling during training.") parser.add_argument("--dry_run", action='store_true', help="Dry run to test the implementation.") parser.add_argument( "--voting_choicer", action='store_true', help="Enable the Voting choicer to select the entity type.") parser.add_argument("--eval_kg_tag", action='store_true', help="Enable to include [ENT] tag in evaluation.") parser.add_argument("--use_subword_tag", action='store_true', help="Enable to use separate tag for subword splits.") parser.add_argument("--debug", action='store_true', help="Enable debug.") parser.add_argument("--reverse_order", action='store_true', help="Reverse the feature selection order.") parser.add_argument("--max_entities", default=2, type=int, help="Number of KG features.") parser.add_argument("--eval_range_with_types", action='store_true', help="Enable to eval range with types.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) set_seed(args.seed) logging.basicConfig(filename=args.log_file, filemode='w', format=fmt) labels_map = {"[PAD]": 0, "[ENT]": 1, "[X]": 2, "[CLS]": 3, "[SEP]": 4} begin_ids = [] # Find tagging labels for file in (args.train_path, args.dev_path, args.test_path): with open(file, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): if line_id == 0: continue labels = line.strip().split("\t")[0].split() for l in labels: if l not in labels_map: if l.startswith("B") or l.startswith("S"): begin_ids.append(len(labels_map)) # check if I-TAG exists infix = l[1] tag = l[2:] inner_tag = f'I{infix}{tag}' if inner_tag not in labels_map: labels_map[inner_tag] = len(labels_map) labels_map[l] = len(labels_map) idx_to_label = {labels_map[key]: key for key in labels_map} print(begin_ids) print("Labels: ", labels_map) args.labels_num = len(labels_map) # Build knowledge graph. if args.kg_name == 'none': kg_file = [] else: kg_file = args.kg_name # Load Luke model. model_archive = ModelArchive.load(args.pretrained_model_path) tokenizer = model_archive.tokenizer # Handling space character in roberta tokenizer byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} # Load the pretrained model encoder = LukeModel(model_archive.config) encoder.load_state_dict(model_archive.state_dict, strict=False) kg = KnowledgeGraph(kg_file=kg_file, tokenizer=tokenizer) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.device = device # Build sequence labeling model. classifiers = { "mlp": LukeTaggerMLP, "lstm": LukeTaggerLSTM, "lstm_crf": LukeTaggerLSTMCRF, "lstm_ncrf": LukeTaggerLSTMNCRF } logger.info(f'The selected classifier is:{classifiers[args.classifier]}') model = classifiers[args.classifier](args, encoder) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) # Read dataset. def read_dataset(path): dataset = [] count = 0 with open(path, mode="r", encoding="utf8") as f: f.readline() tokens, labels = [], [] for line_id, line in enumerate(f): fields = line.strip().split("\t") if len(fields) == 2: labels, tokens = fields elif len(fields) == 3: labels, tokens, cls = fields else: print( f'The data is not in accepted format at line no:{line_id}.. Ignored' ) continue tokens, pos, vm, tag = kg.add_knowledge_with_vm( args, [tokens], [labels]) tokens = tokens[0] pos = pos[0] vm = vm[0].astype("bool") tag = tag[0] # tokens = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token]) non_pad_tokens = [ tok for tok in tokens if tok != tokenizer.pad_token ] num_tokens = len(non_pad_tokens) num_pad = len(tokens) - num_tokens labels = [config.CLS_TOKEN ] + labels.split(" ") + [config.SEP_TOKEN] new_labels = [] j = 0 joiner = '-' for i in range(len(tokens)): if tag[i] == 0 and tokens[i] != tokenizer.pad_token: cur_type = labels[j] if cur_type != 'O': try: joiner = cur_type[1] prev_label = cur_type[2:] except: logger.info( f'The label:{cur_type} is converted to O') prev_label = 'O' j += 1 new_labels.append('O') continue else: prev_label = cur_type new_labels.append(cur_type) j += 1 elif tag[i] == 1 and tokens[ i] != tokenizer.pad_token: # 是添加的实体 new_labels.append('[ENT]') elif tag[i] == 2: if prev_label == 'O': new_labels.append('O') else: if args.use_subword_tag: new_labels.append('[X]') else: new_labels.append(f'I{joiner}' + prev_label) else: new_labels.append(PAD_TOKEN) new_labels = [labels_map[l] for l in new_labels] # print(tokens) # print(labels) # print(tag) if num_pad != 0: print(num_pad) exit() mask = [1] * (num_tokens) + [0] * num_pad word_segment_ids = [0] * (len(tokens)) # print(len(tokens)) # print(len(tag)) # exit() # print(tokenizer.pad_token_id) # for i in range(len(tokens)): # if tag[i] == 0 and tokens[i] != tokenizer.pad_token: # new_labels.append(labels[j]) # j += 1 # elif tag[i] == 1 and tokens[i] != tokenizer.pad_token: # 是添加的实体 # new_labels.append(labels_map['[ENT]']) # elif tag[i] == 2: # if args.use_subword_tag: # new_labels.append(labels_map['[X]']) # else: # new_labels.append(labels_map['[ENT]']) # else: # new_labels.append(labels_map[PAD_TOKEN]) # print(labels) # print(new_labels) # print([idx_to_label.get(key) for key in labels]) # print([idx_to_label.get(key) for key in labels]) # print(mask) # print(pos) # print(word_segment_ids) # print(tokens) # tokens = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokens + [tokenizer.sep_token]) tokens = tokenizer.convert_tokens_to_ids(tokens) # print(tokens) # exit() assert len(tokens) == len(new_labels), AssertionError( "The length of token and label is not matching") dataset.append( [tokens, new_labels, mask, pos, vm, tag, word_segment_ids]) # Enable dry rune if args.dry_run: count += 1 if count == 100: break return dataset # Evaluation function. def evaluate(args, is_test, final=False): if is_test: dataset = read_dataset(args.test_path) else: dataset = read_dataset(args.dev_path) instances_num = len(dataset) batch_size = args.batch_size if is_test: logger.info(f"Batch size:{batch_size}") print(f"The number of test instances:{instances_num}") true_labels_all = [] predicted_labels_all = [] confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long) model.eval() test_batcher = Batcher(args, dataset, token_pad=tokenizer.pad_token_id, label_pad=labels_map[PAD_TOKEN]) for i, (input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, segment_ids_batch) in enumerate(test_batcher): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) pos_ids_batch = pos_ids_batch.to(device) vm_ids_batch = vm_ids_batch.long().to(device) segment_ids_batch = segment_ids_batch.long().to(device) pred, logits, scores = model(input_ids_batch, segment_ids_batch, mask_ids_batch, label_ids_batch, pos_ids_batch, vm_ids_batch, use_kg=args.use_kg) for pred_sample, gold_sample, mask in zip(pred, label_ids_batch, mask_ids_batch): pred_labels = [ idx_to_label.get(key) for key in pred_sample.tolist() ] gold_labels = [ idx_to_label.get(key) for key in gold_sample.tolist() ] num_labels = sum(mask) # Exclude the [CLS], and [SEP] tokens pred_labels = pred_labels[1:num_labels - 1] true_labels = gold_labels[1:num_labels - 1] pred_labels = [p.replace('_NOKG', '') for p in pred_labels] true_labels = [t.replace('_NOKG', '') for t in true_labels] true_labels, pred_labels = filter_kg_labels( true_labels, pred_labels) pred_labels = [p.replace('_', '-') for p in pred_labels] true_labels = [t.replace('_', '-') for t in true_labels] biluo_tags_predicted = get_bio(pred_labels) biluo_tags_true = get_bio(true_labels) if len(biluo_tags_predicted) != len(biluo_tags_true): logger.error( 'The length of the predicted labels is not same as that of true labels..' ) exit() predicted_labels_all.append(biluo_tags_predicted) true_labels_all.append(biluo_tags_true) if final: with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \ open(f'{args.output_file_prefix}_gold.txt', 'a') as g: p.write('\n'.join([' '.join(l) for l in predicted_labels_all])) g.write('\n'.join([' '.join(l) for l in true_labels_all])) return dict( f1=seqeval.metrics.f1_score(true_labels_all, predicted_labels_all), precision=seqeval.metrics.precision_score(true_labels_all, predicted_labels_all), recall=seqeval.metrics.recall_score(true_labels_all, predicted_labels_all), f1_span=f1_score_span(true_labels_all, predicted_labels_all), precision_span=precision_score_span(true_labels_all, predicted_labels_all), recall_span=recall_score_span(true_labels_all, predicted_labels_all), ) # Training phase. logger.info("Start training.") instances = read_dataset(args.train_path) instances_num = len(instances) batch_size = args.batch_size if args.epochs_num: args.num_train_steps = int( instances_num * args.epochs_num / batch_size) + 1 unfreeze_steps = 0 model_frozen = False if args.freeze_proportions != 0.0: unfreeze_steps = int( args.num_train_steps * args.freeze_proportions) + 1 logger.info( f'Two phase training is enabled with model unfreeze at:{unfreeze_steps}' ) # freeze the model model.freeze() model_frozen = True logger.info(f"Batch size:{batch_size}") logger.info(f"The number of training instances:{instances_num}") train_batcher = Batcher(args, instances, token_pad=tokenizer.pad_token_id, label_pad=labels_map[PAD_TOKEN]) optimizer = create_optimizer(args, model) scheduler = create_scheduler(args, optimizer) total_loss = 0. best_f1 = 0.0 # Dry evaluate # evaluate(args, True) def maybe_no_sync(step): if (hasattr(model, "no_sync") and (step + 1) % args.gradient_accumulation_steps != 0): return model.no_sync() else: return contextlib.ExitStack() # YOU MUST LOG INTO WANDB WITH YOUR OWN ACCOUNT if args.wandb: import wandb wandb.init(project="kbert_pretrain") # args.update(wandb.config) print(f'new args{args}') else: wandb = None global_steps = 0 early_stop_steps = 0 epoch = 0 with tqdm(total=args.num_train_steps) as pbar: while True: model.train() for step, (input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, segment_ids_batch) in enumerate(train_batcher): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) pos_ids_batch = pos_ids_batch.to(device) vm_ids_batch = vm_ids_batch.long().to(device) segment_ids_batch = segment_ids_batch.long().to(device) loss, logits = model.score(input_ids_batch, segment_ids_batch, mask_ids_batch, label_ids_batch, pos_ids_batch, vm_ids_batch, use_kg=args.use_kg) if torch.cuda.device_count() > 1: loss = torch.mean(loss) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps with maybe_no_sync(step): loss.backward() total_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.max_grad_norm != 0.0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() optimizer.zero_grad() pbar.set_description("epoch: %d loss: %.7f" % (epoch, loss.item())) pbar.update() global_steps += 1 if global_steps % args.report_steps == 0: logger.info("Epoch id: {}, Global Steps:{}, Avg loss: " "{:.10f}".format( epoch, global_steps + 1, total_loss / args.report_steps)) # Evaluation phase. logger.info("Start evaluate on dev dataset.") results = evaluate(args, False) logger.info(results) logger.info("Start evaluation on test dataset.") results_test = evaluate(args, True) logger.info(results_test) avg_loss = total_loss / args.report_steps if args.wandb: # Log the loss and accuracy values at the end of each epoch wandb.log({ "steps": global_steps, "train Loss": avg_loss, "valid_acc": results['f1'], "test_acc": results_test['f1'], "learning_rate": args.learning_rate, "batch_size": args.batch_size, "lr_schedule": args.lr_schedule, "weight_decay": args.weight_decay, "max_grad_norm": args.max_grad_norm, }) if results['f1'] > best_f1: best_f1 = results['f1'] early_stop_steps = 0 save_model(model, args.output_model_path) save_encoder(args, encoder, suffix=args.suffix_file_encoder) else: early_stop_steps += args.report_steps # Change back the model for training model.train() total_loss = 0. if model_frozen and global_steps >= unfreeze_steps: # unfreeze the model and start training logger.info('The encoder is unfrozen for training.') model.unfreeze() model_frozen = False if global_steps >= args.num_train_steps: # Training completed logger.info('The training is completed!') break if early_stop_steps >= args.patience: # Early stopping logger.info('The early stopping is triggered!') break if model_frozen and global_steps >= unfreeze_steps: # unfreeze the model and start training logger.info('The encoder is unfrozen for training.') model.unfreeze() model_frozen = False if global_steps >= args.num_train_steps: # Training completed logger.info('The training is completed!') break if early_stop_steps >= args.patience: # Early stopping logger.info('The early stopping is triggered!') break epoch += 1 # Evaluation phase. logger.info("Final evaluation on test dataset.") if torch.cuda.device_count() > 1: model.module.load_state_dict(torch.load(args.output_model_path)) else: model.load_state_dict(torch.load(args.output_model_path)) results_final = evaluate(args, True, final=True) logger.info(results_final)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.") parser.add_argument("--output_model_path", default="./models/ner_model.bin", type=str, help="Path of the output model.") parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.") parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.") parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.") # Model options. parser.add_argument("--batch_size", type=int, default=32, help="Batch_size.") parser.add_argument("--seq_length", default=128, type=int, help="Sequence length.") parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Emebdding type.") parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \ "cnn", "gatedcnn", "attn", \ "rcnn", "crnn", "gpt", "bilstm"], \ default="bert", help="Encoder type.") parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.") # Subword options. parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.") parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.") parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.") parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.") # Optimizer options. parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.") parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.") # Training options. parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.") parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.") parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.") parser.add_argument("--seed", type=int, default=7, help="Random seed.") args = parser.parse_args() # Load the hyperparameters of the config file. args = load_hyperparam(args) set_seed(args.seed) labels_map = {"[PAD]": 0} begin_ids = [] # Find tagging labels with open(args.train_path, mode="r", encoding="utf-8") as f: for line_id, line in enumerate(f): if line_id == 0: continue labels = line.strip().split("\t")[1].split() for l in labels: if l not in labels_map: if l.startswith("B") or l.startswith("S"): begin_ids.append(len(labels_map)) labels_map[l] = len(labels_map) print("Labels: ", labels_map) args.labels_num = len(labels_map) # Load vocabulary. vocab = Vocab() vocab.load(args.vocab_path) args.vocab = vocab # Build bert model. # A pseudo target is added. args.target = "bert" model = build_model(args) # Load or initialize parameters. if args.pretrained_model_path is not None: # Initialize with pretrained model. model.load_state_dict(torch.load(args.pretrained_model_path), strict=False) else: # Initialize with normal distribution. for n, p in list(model.named_parameters()): if 'gamma' not in n and 'beta' not in n: p.data.normal_(0, 0.02) # Build sequence labeling model. model = BertTagger(args, model) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) # Datset loader. def batch_loader(batch_size, input_ids, label_ids, mask_ids): instances_num = input_ids.size()[0] for i in range(instances_num // batch_size): input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :] label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :] mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :] yield input_ids_batch, label_ids_batch, mask_ids_batch if instances_num > instances_num // batch_size * batch_size: input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :] label_ids_batch = label_ids[instances_num // batch_size * batch_size:, :] mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :] yield input_ids_batch, label_ids_batch, mask_ids_batch # Read dataset. def read_dataset(path): dataset = [] with open(path, mode="r", encoding="utf-8") as f: f.readline() tokens, labels = [], [] for line_id, line in enumerate(f): tokens, labels = line.strip().split("\t") tokens = [vocab.get(t) for t in tokens.split(" ")] labels = [labels_map[l] for l in labels.split(" ")] mask = [1] * len(tokens) if len(tokens) > args.seq_length: tokens = tokens[:args.seq_length] labels = labels[:args.seq_length] mask = mask[:args.seq_length] while len(tokens) < args.seq_length: tokens.append(0) labels.append(0) mask.append(0) dataset.append([tokens, labels, mask]) return dataset # Evaluation function. def evaluate(args, is_test): if is_test: dataset = read_dataset(args.test_path) else: dataset = read_dataset(args.dev_path) input_ids = torch.LongTensor([sample[0] for sample in dataset]) label_ids = torch.LongTensor([sample[1] for sample in dataset]) mask_ids = torch.LongTensor([sample[2] for sample in dataset]) instances_num = input_ids.size(0) batch_size = args.batch_size if is_test: print("Batch size: ", batch_size) print("The number of test instances:", instances_num) correct = 0 gold_entities_num = 0 pred_entities_num = 0 confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long) model.eval() for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) loss, _, pred, gold = model(input_ids_batch, label_ids_batch, mask_ids_batch) for j in range(gold.size()[0]): if gold[j].item() in begin_ids: gold_entities_num += 1 for j in range(pred.size()[0]): if pred[j].item( ) in begin_ids and gold[j].item() != labels_map["[PAD]"]: pred_entities_num += 1 pred_entities_pos = [] gold_entities_pos = [] start, end = 0, 0 for j in range(gold.size()[0]): if gold[j].item() in begin_ids: start = j for k in range(j + 1, gold.size()[0]): if gold[k].item( ) == labels_map["[PAD]"] or gold[k].item( ) == labels_map["O"] or gold[k].item() in begin_ids: end = k - 1 break else: end = gold.size()[0] - 1 gold_entities_pos.append((start, end)) for j in range(pred.size()[0]): if pred[j].item( ) in begin_ids and gold[j].item() != labels_map["[PAD]"]: start = j for k in range(j + 1, pred.size()[0]): if pred[k].item( ) == labels_map["[PAD]"] or pred[k].item( ) == labels_map["O"] or pred[k].item() in begin_ids: end = k - 1 break else: end = pred.size()[0] - 1 pred_entities_pos.append((start, end)) for entity in pred_entities_pos: if entity not in gold_entities_pos: continue for j in range(entity[0], entity[1] + 1): if gold[j].item() != pred[j].item(): break else: correct += 1 print("Report precision, recall, and f1:") p = correct / pred_entities_num r = correct / gold_entities_num f1 = 2 * p * r / (p + r) print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1)) return f1 # Training phase. print("Start training.") instances = read_dataset(args.train_path) input_ids = torch.LongTensor([ins[0] for ins in instances]) label_ids = torch.LongTensor([ins[1] for ins in instances]) mask_ids = torch.LongTensor([ins[2] for ins in instances]) instances_num = input_ids.size(0) batch_size = args.batch_size train_steps = int(instances_num * args.epochs_num / batch_size) + 1 print("Batch size: ", batch_size) print("The number of training instances:", instances_num) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] ooptimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup, t_total=train_steps) total_loss = 0. f1 = 0.0 best_f1 = 0.0 for epoch in range(1, args.epochs_num + 1): model.train() for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate( batch_loader(batch_size, input_ids, label_ids, mask_ids)): model.zero_grad() input_ids_batch = input_ids_batch.to(device) label_ids_batch = label_ids_batch.to(device) mask_ids_batch = mask_ids_batch.to(device) loss, _, _, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch) if torch.cuda.device_count() > 1: loss = torch.mean(loss) total_loss += loss.item() if (i + 1) % args.report_steps == 0: print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}". format(epoch, i + 1, total_loss / args.report_steps)) total_loss = 0. loss.backward() optimizer.step() scheduler.step() f1 = evaluate(args, False) if f1 > best_f1: best_f1 = f1 save_model(model, args.output_model_path) else: continue # Evaluation phase. if args.test_path is not None: print("Test set evaluation.") model = load_model(model, args.output_model_path) evaluate(args, True)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Path options. parser.add_argument("--load_model_path", default=None, type=str, help="Path of the classfier model.") parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.") parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.") parser.add_argument("--test_path", type=str, help="Path of the testset.") parser.add_argument("--test_features_path", default=None, type=str, help="Path of the test features for stacking.") parser.add_argument("--config_path", default="models/bert/base_config.json", type=str, help="Path of the config file.") # Model options. model_opts(parser) parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.") # Inference options. parser.add_argument("--batch_size", type=int, default=64, help="Batch size.") parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.") parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.") # Tokenizer options. tokenizer_opts(parser) # Output options. parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.") parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.") # Cross validation options. parser.add_argument("--folds_num", type=int, default=5, help="The number of folds for cross validation.") args = parser.parse_args() # Load the hyperparameters from the config file. args = load_hyperparam(args) # Build tokenizer. args.tokenizer = str2tokenizer[args.tokenizer](args) # Build classification model and load parameters. args.soft_targets, args.soft_alpha = False, False dataset = read_dataset(args, args.test_path) src = torch.LongTensor([sample[0] for sample in dataset]) seg = torch.LongTensor([sample[1] for sample in dataset]) batch_size = args.batch_size instances_num = src.size()[0] print("The number of prediction instances: ", instances_num) test_features = [[] for _ in range(args.folds_num)] for fold_id in range(args.folds_num): load_model_name = ".".join(args.load_model_path.split(".")[:-1]) load_model_suffix = args.load_model_path.split(".")[-1] model = Classifier(args) model = load_model( model, load_model_name + "-fold_" + str(fold_id) + "." + load_model_suffix) # For simplicity, we use DataParallel wrapper to use multiple GPUs. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) if torch.cuda.device_count() > 1: print("{} GPUs are available. Let's use them.".format( torch.cuda.device_count())) model = torch.nn.DataParallel(model) model.eval() for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)): src_batch = src_batch.to(device) seg_batch = seg_batch.to(device) with torch.no_grad(): _, logits = model(src_batch, None, seg_batch) prob = nn.Softmax(dim=1)(logits) prob = prob.cpu().numpy().tolist() test_features[fold_id].extend(prob) test_features = np.array(test_features) test_features = np.mean(test_features, axis=0) np.save(args.test_features_path, test_features)