def create_dataset(self, features, is_sorted=False):
    # Convert to Tensors and build dataset
    if is_sorted:
        logger.info("sorted data by the length of input")
        features = sorted(features, key=lambda x: x.input_len, reverse=True)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_trigger_mask = torch.tensor([f.trigger_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_input_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    all_one_hot_labels = torch.tensor([f.one_hot_labels for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_trigger_mask,
                            all_segment_ids, all_label_ids, all_input_lens,
                            all_one_hot_labels)
    return dataset
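# Illustrative usage sketch (not part of the original pipeline): wraps the
# TensorDataset returned by create_dataset in a DataLoader. `processor` is
# assumed to be a BertProcessor instance and `features` the output of
# create_features below; the helper name is hypothetical.
def _example_build_dataloader(processor, features, batch_size=16):
    from torch.utils.data import DataLoader, RandomSampler
    dataset = processor.create_dataset(features, is_sorted=False)
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)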
def create_examples(self, lines, example_type, cached_file, save_cache):
    '''Creates examples for the data.'''
    label_list = self.get_labels()
    if cached_file and cached_file.exists():
        logger.info("Loading examples from cached file %s", cached_file)
        examples = torch.load(cached_file)
    else:
        pbar = ProgressBar(n_total=len(lines), desc='create examples')
        examples = []
        for i, line in enumerate(lines):
            # if i > 20: break  # for quick debugging
            guid = '%s-%d' % (example_type, i)
            label = line['tags']
            text_a = line['info']
            text_b = None
            match = line["cira_match"]
            if self.test_mode == 4 and sum(match) < 4:
                continue
            else:
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=text_b,
                                 label=label, match=match))
            pbar(step=i)
        if save_cache:
            logger.info("Saving examples into cached file %s", cached_file)
            torch.save(examples, cached_file)
    return examples
def read_type_data(cls, input_file, type):
    with jsonlines.open(input_file) as reader:
        lines = []
        for line in reader:
            e_d = line["guid"].split("_")[1]
            # e_m = sum(line["cira_match"])
            # datasets["%s_%d" % (e_d, e_m)].append(line)
            if e_d in type:
                lines.append(line)
    logger.info("type {} number = {}".format(type, len(lines)))
    return lines
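# Illustrative sketch (an assumption inferred from the fields accessed above,
# not taken from the original data): a minimal JSONL record this loader expects.
_EXAMPLE_RECORD = {
    "guid": "train_e_12",         # second "_"-separated segment is the difficulty (e/m/h)
    "info": "raw sentence text",  # character sequence tokenized in create_features
    "tags": ["O", "O", "B-COM"],  # per-character labels
    "cira_match": [1, 1, 1, 0],   # per-slot match flags; sum(match) < 4 enables partial labeling
}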
def create_features(self, examples, max_seq_len, cached_file, save_cache=False):
    '''
    The convention in BERT is:
    (a) For sequence pairs:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids:   0   0   0    0    0      0     0   0    1  1  1  1  1   1
    (b) For single sequences:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids:   0    0   0   0   0   0   0
    '''
    max_length = 0
    if cached_file and cached_file.exists():
        logger.info("Loading features from cached file %s", cached_file)
        features = torch.load(cached_file)
    else:
        label_list = self.get_labels()
        label2id = {label: i for i, label in enumerate(label_list, 0)}
        pbar = ProgressBar(n_total=len(examples), desc='create features')
        features = []
        for ex_id, example in enumerate(examples):
            # textlist = []
            # for sentence in example.text_a:
            #     textlist.extend(list(sentence))
            textlist = list(example.text_a)
            if len(textlist) > max_length:
                max_length = len(textlist)
            tokens = self.tokenizer.tokenize(textlist)
            labels = example.label
            match = example.match
            if len(tokens) >= max_seq_len - 2:
                tokens = tokens[0:(max_seq_len - 2)]
                labels = labels[0:(max_seq_len - 2)]
            ntokens = []
            segment_ids = []
            label_ids = []
            one_hot_labels = []
            ntokens.append("[CLS]")
            segment_ids.append(0)
            mask_tags = [1] * len(label2id)
            mask_tags[label2id["[CLS]"]] = 0
            one_hot_labels.append(mask_tags)
            label_ids.append(label2id["[CLS]"])
            possible_tags = [1] * len(label2id)
            trigger_mask = []
            trigger_mask.append(0)
            if sum(match) < 4:
                possible_tags[0] = 0
                # assert match[0] == 1
                # if match[0] < 1:
                #     possible_tags[1:3] = [0, 0]
                if match[1] < 1:
                    possible_tags[3:5] = [0, 0]
                # assert match[2] == 1
                # if match[2] < 1:
                #     possible_tags[5:7] = [0, 0]
                if match[3] < 1:
                    possible_tags[7:9] = [0, 0]
            for i, token in enumerate(tokens):
                ntokens.append(token)
                segment_ids.append(0)
                label_ids.append(label2id[labels[i]])
                if "ROU" in labels[i]:
                    trigger_mask.append(1)
                else:
                    trigger_mask.append(0)
                if sum(match) < 4 and labels[i] == 'O' and (
                        token not in range(7993, 8029)) and (
                        token not in range(8039, 8051)):
                    one_hot_labels.append(possible_tags)
                else:
                    mask_tags = [1] * len(label2id)
                    mask_tags[label2id[labels[i]]] = 0
                    one_hot_labels.append(mask_tags)
            ntokens.append("[SEP]")
            segment_ids.append(0)
            label_ids.append(label2id["[SEP]"])
            mask_tags = [1] * len(label2id)
            mask_tags[label2id["[SEP]"]] = 0
            one_hot_labels.append(mask_tags)
            trigger_mask.append(0)
            input_ids = self.tokenizer.convert_tokens_to_ids(ntokens)
            input_mask = [1] * len(input_ids)
            input_len = len(label_ids)
            while len(input_ids) < max_seq_len:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
                label_ids.append(0)
                one_hot_labels.append([1] * len(label2id))
                trigger_mask.append(0)
            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(segment_ids) == max_seq_len
            assert len(label_ids) == max_seq_len
            assert len(one_hot_labels) == max_seq_len
            for i in range(len(one_hot_labels)):
                if len(one_hot_labels[i]) < 11:
                    logger.info("one-hot labels: pos:%d, %s" %
                                (i, " ".join([str(x) for x in one_hot_labels[i]])))
            # if ex_id < 2:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s id: %s" %
                        (" ".join(example.label),
                         " ".join([str(x) for x in label_ids])))
            features.append(
                InputFeature(input_ids=input_ids,
                             input_mask=input_mask,
                             trigger_mask=trigger_mask,
                             segment_ids=segment_ids,
                             label_id=label_ids,
                             one_hot_labels=one_hot_labels,
                             input_len=input_len))
            pbar(step=ex_id)
        if save_cache:
            logger.info("Saving features into cached file %s", cached_file)
            torch.save(features, cached_file)
    logger.info("max_seq_length = {}".format(max_length))
    return features
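# Illustrative sketch (hypothetical helper, not from the original repo): how the
# preprocessing methods above chain together when caching is disabled.
# `processor` is a BertProcessor instance; max_seq_len=256 mirrors the CLI
# default further below.
def _example_preprocess(processor, input_file, max_seq_len=256):
    lines = processor.read_type_data(input_file, type="e")
    examples = processor.create_examples(lines, example_type="train",
                                         cached_file=None, save_cache=False)
    features = processor.create_features(examples, max_seq_len=max_seq_len,
                                         cached_file=None, save_cache=False)
    return processor.create_dataset(features, is_sorted=True)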
def run_train(args):
    processor = BertProcessor(vocab_path=os.path.join(args.pretrained_model, 'vocab.txt'),
                              test_mode=args.test_mode,
                              do_lower_case=args.do_lower_case)
    # processor.tokenizer.save_vocabulary(str(args.model_path))
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}

    train_cache_sample = config['data_dir'] / "cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / "cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(
            os.path.join(config['data_dir'], "train.jsonl"), type=args.type)
        valid_data = processor.read_type_data(
            os.path.join(config['data_dir'], "dev.jsonl"), type=args.type)
        train_cache_sample = config['data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config['data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(os.path.join(config['data_dir'], "dev.jsonl"))

    if args.early_stop:
        early_stopping = EarlyStopping(patience=3, monitor="f1", baseline=0, mode='max')
    else:
        early_stopping = None

    train_dataset = convert_data_to_tensor(processor=processor,
                                           args=args,
                                           data=train_data,
                                           type="train",
                                           cache_sample_path=train_cache_sample,
                                           cache_feature_path=train_cache_feature,
                                           save_cache=False)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_dataset = convert_data_to_tensor(processor=processor,
                                           args=args,
                                           data=valid_data,
                                           type="dev",
                                           cache_sample_path=config['data_dir'] / "cached_dev_seq_examples",
                                           cache_feature_path=config['data_dir'] / "cached_dev_seq_features",
                                           save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    # model = BERTCRF
    # bert_config = BertConfig.from_json_file(os.path.join(args.pretrained_model, "config.json"))
    # bert_config.num_hidden_layers = args.depth
    # if args.resume_path:
    #     args.resume_path = Path(args.resume_path)
    #     model = model.from_pretrained(args.resume_path, label2id=label2id,
    #                                   device=args.device, config=bert_config)
    # else:
    #     model = model.from_pretrained(args.pretrained_model, label2id=label2id,
    #                                   device=args.device, config=bert_config)
    bert_config = BertConfig.from_json_file(os.path.join(args.pretrained_model, "config.json"))
    model = CNNLSTMCRF(config=bert_config, label2id=label2id, device=args.device)

    # Load only the pretrained embedding weights into the model's BERT embedding module.
    ckpt = torch.load(os.path.join(args.pretrained_model, "pytorch_model.bin"))
    if "state_dict" in ckpt:
        state_dict = ckpt["state_dict"]
    else:
        state_dict = ckpt
    for key in list(state_dict.keys()):
        if 'embedding' in key:
            new_key = key.replace("bert.embeddings.", "")  # drop the 'bert.embeddings.' prefix
            state_dict[new_key] = state_dict.pop(key)
    try:
        model.BERTEm.load_state_dict(state_dict, strict=True)
    except Exception as e:
        print(e)
    model = model.to(args.device)

    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    optimizer = RMSprop(model.parameters(), lr=args.learning_rate)
    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    tb_logger = Tensorboard_Logger(log_dir=os.path.join(args.model_path, config['output']))
    trainer = Trainer(n_gpu=args.n_gpu,
                      model=model,
                      logger=logger,
                      tb_logger=tb_logger,
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      label2id=label2id,
                      grad_clip=args.grad_clip,
                      model_checkpoint=model_checkpoint,
                      gradient_accumulation_steps=args.gradient_accumulation_steps,
                      early_stopping=early_stopping,
                      partial=args.partial,
                      trigger=args.trigger)
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert_crf', type=str)
    parser.add_argument("--type", default='', type=str)
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--do_predict", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--early_stop", action='store_true')
    parser.add_argument('--data_name', default='datagrand', type=str)
    parser.add_argument('--optimizer', default='adam', type=str, choices=['adam', 'lookahead'])
    parser.add_argument('--markup', default='bios', type=str, choices=['bio', 'bios'])
    parser.add_argument('--checkpoint', default=900000, type=int)
    parser.add_argument("--epochs", default=30, type=int)
    parser.add_argument('--fold', default=0, type=int)
    # e.g. --resume_path src/output/checkpoints/bert_lstm_crf_bios_fold_0/checkpoint-epoch-30
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='max', type=str)
    parser.add_argument("--monitor", default='f1', type=str)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help='1: True, 0: False')
    parser.add_argument("--n_gpu", type=str, default='0', help='"0,1,..." or "0" or ""')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2)
    parser.add_argument("--train_batch_size", default=16, type=int)
    parser.add_argument('--eval_batch_size', default=64, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.05, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=5.0, type=float)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument("--no_cuda", action='store_true')
    parser.add_argument("--partial", action='store_true')
    parser.add_argument("--trigger", action='store_true')
    parser.add_argument("--test_mode", type=int, default=0)
    parser.add_argument("--pretrained_model", type=str, default="pretrained_model")
    parser.add_argument("--depth", type=int)
    args = parser.parse_args()

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if args.type:
        args.arch += f"_{args.type}"
    # name_str = "_bs-{}_lr-{}_len-{}".format(args.train_batch_size, args.learning_rate, args.train_max_seq_len)
    args.model_path = config['output'] / args.arch
    args.model_path.mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, args.model_path / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=args.model_path / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)
    if args.do_train:
        run_train(args)
    if args.do_test:
        run_test(args)
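# Illustrative invocation (the script name is hypothetical; the flags are the
# ones defined in main() above):
#   python train.py --do_train --do_lower_case --save_best --early_stop \
#       --pretrained_model pretrained_model --arch bert_crf \
#       --train_batch_size 16 --learning_rate 1e-4 --epochs 30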
def run_test(args):
    processor = BertProcessor(os.path.join(args.pretrained_model, 'vocab.txt'),
                              args.do_lower_case,
                              test_mode=args.test_mode)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    # id2label = {i: label for i, label in enumerate(label_list)}

    bert_config = BertConfig.from_json_file(os.path.join(args.pretrained_model, "config.json"))
    bert_config.num_hidden_layers = args.depth
    model = CNNLSTMCRF(config=bert_config, label2id=label2id, device=args.device)

    ckpt = torch.load(os.path.join(args.resume_path, "pytorch_model.bin"))
    if "state_dict" in ckpt:
        state_dict = ckpt["state_dict"]
    else:
        state_dict = ckpt
    try:
        model.load_state_dict(state_dict, strict=True)
    except Exception as e:
        print(e)
    model = model.to(args.device)

    trainer = Trainer(n_gpu=args.n_gpu,
                      model=model,
                      logger=logger,
                      tb_logger=None,
                      optimizer=None,
                      lr_scheduler=None,
                      label2id=label2id,
                      grad_clip=args.grad_clip,
                      model_checkpoint=None,
                      gradient_accumulation_steps=args.gradient_accumulation_steps,
                      partial=args.partial,
                      trigger=args.trigger)

    split = True
    if split:
        diff = ["e", "m", "h"]
        results = {}
        for d in diff:
            test_data = processor.read_type_data(
                os.path.join(config['data_dir'], "test_gold_all.jsonl"), type=d)
            test_dataset = convert_data_to_tensor(processor=processor,
                                                  args=args,
                                                  data=test_data,
                                                  type=d,
                                                  cache_sample_path=None,
                                                  cache_feature_path=None,
                                                  save_cache=False)
            test_sampler = SequentialSampler(test_dataset)
            test_dataloader = DataLoader(test_dataset,
                                         sampler=test_sampler,
                                         batch_size=args.eval_batch_size)
            info, ex_info, class_info = trainer.valid_epoch(test_dataloader)
            results[d] = [class_info, ex_info, info]

        with open(os.path.join(args.model_path, 'result.json'), 'w') as f:
            f.write(json.dumps(results))

        prf = ["precision", "recall", "f1"]
        ex_prf = ["ex_p", "ex_r", "ex_f1"]
        types = ["COM", "INV", "ROU", "AMO"]
        logger.info("Eval results:")
        for d in diff:
            values = []
            class_info, ex_info, info = results[d]
            for t in types:
                cv = class_info[t]
                for k in prf:
                    values.append("{:.4f}".format(cv[k]))
            for k in prf:
                values.append("{:.4f}".format(info[k]))
            for k in ex_prf:
                values.append("{:.4f}".format(ex_info[k]))
            show_info = f'diff:{d},' + ",".join(values)
            logger.info(show_info)
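# Note on the output written above (description, not new behavior): result.json
# maps each difficulty ("e"/"m"/"h") to [class_info, ex_info, info], where
# class_info holds per-type precision/recall/f1 for COM/INV/ROU/AMO, info holds
# the overall precision/recall/f1, and ex_info holds ex_p/ex_r/ex_f1.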
def run_train(args):
    processor = BertProcessor(vocab_path=os.path.join(args.pretrained_model, 'vocab.txt'),
                              test_mode=args.test_mode,
                              do_lower_case=args.do_lower_case)
    # processor.tokenizer.save_vocabulary(str(args.model_path))
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}

    train_cache_sample = config['data_dir'] / "cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / "cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(
            os.path.join(config['data_dir'], "train.jsonl"), type=args.type)
        valid_data = processor.read_type_data(
            os.path.join(config['data_dir'], "dev.jsonl"), type=args.type)
        train_cache_sample = config['data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config['data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(os.path.join(config['data_dir'], "dev.jsonl"))

    if args.early_stop:
        early_stopping = EarlyStopping(patience=3, monitor="f1", baseline=0, mode='max')
    else:
        early_stopping = None

    train_dataset = convert_data_to_tensor(processor=processor,
                                           args=args,
                                           data=train_data,
                                           type="train",
                                           cache_sample_path=train_cache_sample,
                                           cache_feature_path=train_cache_feature,
                                           save_cache=False)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_dataset = convert_data_to_tensor(processor=processor,
                                           args=args,
                                           data=valid_data,
                                           type="dev",
                                           cache_sample_path=config['data_dir'] / "cached_dev_seq_examples",
                                           cache_feature_path=config['data_dir'] / "cached_dev_seq_features",
                                           save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    model = BERTCRF
    bert_config = BertConfig.from_json_file(os.path.join(args.pretrained_model, "config.json"))
    bert_config.num_hidden_layers = args.depth
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = model.from_pretrained(args.resume_path,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)
    else:
        model = model.from_pretrained(args.pretrained_model,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)
    model = model.to(args.device)

    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    # Parameter groups: BERT parameters use args.learning_rate, the CRF and
    # classifier layers use a fixed 0.001; bias and LayerNorm weights are
    # excluded from weight decay.
    bert_param_optimizer = list(model.bert.named_parameters())
    crf_param_optimizer = list(model.crf.named_parameters())
    linear_param_optimizer = list(model.classifier.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01, 'lr': args.learning_rate},
        {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': args.learning_rate},
        {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01, 'lr': 0.001},
        {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': 0.001},
        {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01, 'lr': 0.001},
        {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': 0.001},
    ]
    if args.optimizer == 'adam':
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    else:
        raise ValueError("unknown optimizer")

    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    tb_logger = Tensorboard_Logger(log_dir=os.path.join(args.model_path, config['output']))
    trainer = Trainer(n_gpu=args.n_gpu,
                      model=model,
                      logger=logger,
                      tb_logger=tb_logger,
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      label2id=label2id,
                      grad_clip=args.grad_clip,
                      model_checkpoint=model_checkpoint,
                      gradient_accumulation_steps=args.gradient_accumulation_steps,
                      early_stopping=early_stopping,
                      partial=args.partial,
                      trigger=args.trigger)
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)