def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--train_iter_multiplier", default=1.0, type=float, help="multiplier for the multi-task training.", ) parser.add_argument( "--train_iter_gap", default=4, type=int, help= "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--optim", default="AdamW", type=str, help="what to use for the optimization.") parser.add_argument("--tasks", default="", type=str, help="1-2-3... 
training task separate by -") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.", ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not.", ) parser.add_argument("--evaluation_interval", default=1, type=int, help="evaluate very n epoch.") parser.add_argument( "--lr_scheduler", default="mannul", type=str, help="whether use learning rate scheduler.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] task_lr = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) task_lr.append(task_cfg[task]["lr"]) base_lr = min(task_lr) loss_scale = {} for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id loss_scale[task] = task_lr[i] / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ("-".join(task_names) + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. 
with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( args, task_cfg, args.tasks.split("-")) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: config.task_specific_tokens = True if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_ave_iter = {} task_stop_controller = {} for task_id, num_iter in task_num_iters.items(): task_ave_iter[task_id] = int(task_cfg[task]["num_epoch"] * num_iter * args.train_iter_multiplier / args.num_train_epochs) task_stop_controller[task_id] = utils.MultiTaskStopOnPlateau( mode="max", patience=1, continue_threshold=0.005, cooldown=1, threshold=0.001, ) task_ave_iter_list = sorted(task_ave_iter.values()) median_num_iter = task_ave_iter_list[-1] num_train_optimization_steps = (median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps) num_labels = max( [dataset.num_labels for dataset in task_datasets_train.values()]) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr) warmpu_steps = args.warmup_proportion * num_train_optimization_steps if args.lr_scheduler == "warmup_linear": warmup_scheduler = WarmupLinearSchedule( optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps) else: warmup_scheduler = WarmupConstantSchedule(optimizer, 
warmup_steps=warmpu_steps) lr_reduce_list = np.array([5, 7]) if args.lr_scheduler == "automatic": lr_scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001) elif args.lr_scheduler == "cosine": lr_scheduler = CosineAnnealingLR(optimizer, T_max=median_num_iter * args.num_train_epochs) elif args.lr_scheduler == "cosine_warm": lr_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=median_num_iter * args.num_train_epochs) elif args.lr_scheduler == "mannul": def lr_lambda_fun(epoch): return pow(0.2, np.sum(lr_reduce_list <= epoch)) lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace( "module.", "", 1)] = checkpoint["model_state_dict"][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) warmup_scheduler.load_state_dict( checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) print(" Num steps: %d" % num_train_optimization_steps) task_iter_train = {name: None for name in task_ids} task_count = {name: 0 for name in task_ids} for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"): model.train() for step in range(median_num_iter): iterId = startIterID + step + (epochId * median_num_iter) first_task = True for task_id in task_ids: is_forward = False if (not task_stop_controller[task_id].in_stop) or ( iterId % args.train_iter_gap == 0): is_forward = True if is_forward: loss, score = ForwardModelsTrain( args, task_cfg, device, task_id, task_count, task_iter_train, task_dataloader_train, model, task_losses, ) loss = loss * loss_scale[task_id] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step if first_task and (global_step < warmpu_steps or args.lr_scheduler == "warmup_linear"): warmup_scheduler.step() optimizer.step() model.zero_grad() if first_task: global_step += 1 first_task = False if default_gpu: tbLogger.step_train( epochId, iterId, float(loss), float(score), optimizer.param_groups[0]["lr"], task_id, "train", ) if "cosine" in args.lr_scheduler and global_step > warmpu_steps: lr_scheduler.step() if (step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu): tbLogger.showLossTrain() # decided whether to evaluate on each tasks. for task_id in task_ids: if (iterId != 0 and iterId % task_num_iters[task_id] == 0) or (epochId == args.num_train_epochs - 1 and step == median_num_iter - 1): evaluate( args, task_dataloader_val, task_stop_controller, task_cfg, device, task_id, model, task_losses, epochId, default_gpu, tbLogger, ) if args.lr_scheduler == "automatic": lr_scheduler.step(sum(val_scores.values())) logger.info("best average score is %3f" % lr_scheduler.best) elif args.lr_scheduler == "mannul": lr_scheduler.step() if epochId in lr_reduce_list: for task_id in task_ids: # reset the task_stop_controller once the lr drop task_stop_controller[task_id]._reset() if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin") output_checkpoint = os.path.join(savePath, "pytorch_ckpt_latest.tar") torch.save(model_to_save.state_dict(), output_model_file) torch.save( { "model_state_dict": model_to_save.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "warmup_scheduler_state_dict": warmup_scheduler.state_dict(), # 'lr_scheduler_state_dict': lr_scheduler.state_dict(), "global_step": global_step, "epoch_id": epochId, "task_stop_controller": task_stop_controller, "tb_logger": tbLogger, }, output_checkpoint, ) tbLogger.txt_close()
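# A minimal, self-contained sketch (not part of the training script above) of the per-task
# loss-scale arithmetic used in main(): each task's loss is multiplied by task_lr / min(task_lr),
# so with the optimizer running at the smallest learning rate, tasks configured with a larger lr
# still receive proportionally larger updates. The task names and lrs below are illustrative,
# not values taken from vilbert_tasks.yml.
def compute_loss_scales(task_lrs):
    base_lr = min(task_lrs.values())
    return base_lr, {task: lr / base_lr for task, lr in task_lrs.items()}

# Example: {"TASK1": 4e-5, "TASK2": 2e-5} -> base_lr 2e-5, scales {"TASK1": 2.0, "TASK2": 1.0}.
# Likewise, warmup_proportion 0.1 over 10000 optimization steps gives 1000 warmup steps.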
        aspect = self.aspect_bert(text_ids, attention_mask=text_att_mask)[0]
        if self.tp == 'cdm':
            mask = torch.zeros_like(text_ids).float()  # B L
            for i, (s, e) in enumerate(pos):
                s = max(0, s.item() - self.SDR)
                e = min(e.item() + self.SDR + 1, aspect.size(1))
                mask[i, s:e] = 1.0
            aspect = aspect * mask.unsqueeze(-1)
        cat = torch.cat([text_asp, aspect], -1)  # B L 2H
        cat = self.reduce2_bert_dim(cat)
        x = self.aspect_self_att(cat)  # B L H
        x = self.bert_pooler(x)  # B H
        out = self.reduce2_num_class_linear(x)
        # , 'aspect_emphasize_att_score_BhL': aspect_emphasize_att_score.squeeze().detach().cpu().numpy()}
        return {'output': out}


if __name__ == "__main__":
    conf = BertConfig.from_pretrained('/mnt/sda1/bert/uncased_L-12_H-768_A-12')
    m = MY_BERT_LCF(conf)
    input_ids = torch.ones((16, 80)).long()
    attention_mask = torch.ones_like(input_ids)
    text_asp_ids, text_asp_att_mask = copy.deepcopy(input_ids), copy.deepcopy(attention_mask)
    pos = torch.LongTensor([[1, 4] for _ in range(16)])
    print(m.forward(text_asp_ids, text_asp_att_mask, input_ids, attention_mask,
                    pos)['output'].size())
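# Hedged sketch of the CDM-style masking performed in the forward pass above: hidden states
# outside a window of `sdr` tokens around the aspect span [s, e] are zeroed before fusion.
# make_cdm_mask and sdr are illustrative names, not attributes of MY_BERT_LCF; torch is
# assumed to be imported at the top of this module, as it is used throughout the file.
def make_cdm_mask(batch_size, seq_len, spans, sdr):
    mask = torch.zeros(batch_size, seq_len)
    for i, (s, e) in enumerate(spans):
        lo = max(0, s - sdr)
        hi = min(e + sdr + 1, seq_len)
        mask[i, lo:hi] = 1.0
    return mask  # B x L; broadcast over the hidden dimension with mask.unsqueeze(-1)

# Example: make_cdm_mask(1, 10, [(1, 4)], 3) keeps positions 0..7 and zeroes positions 8..9.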
parser = argparse.ArgumentParser(description="inference") parser.add_argument("opts", default=None, nargs=argparse.REMAINDER) args = parser.parse_args() config.merge_from_list(args.opts) config.freeze() save_dir = os.path.join(config.save_dir) # mkdir(save_dir) logger = setup_logger("inference", save_dir, 0) logger.info("Running with config:\n{}".format(config)) device = torch.device(config.device) num_types = 2 generator = Generator(BertConfig()) checkpoint = torch.load(os.path.join(config.model_path, 'best_model.pth')) generator.load_state_dict(checkpoint['state_dict']) generator = generator.to(device) # g_checkpointer = Checkpointer(model=generator, logger=logger) # g_checkpointer.load(config.model_path, True) dataset = COCOCaptionGeneratorDataset(root=config.root_dir, split='test') # data_loader = make_data_loader( # dataset=dataset, # batch_size=8, # num_workers=0, # split='test' # )
def main(): # os.environ['CUDA_VISIBLE_DEVICES'] = "0,1" batch_size = 64 parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--train_iter_multiplier", default=1.0, type=float, help="multiplier for the multi-task training.", ) parser.add_argument( "--train_iter_gap", default=4, type=int, help= "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--optim", default="AdamW", type=str, help="what to use for the optimization.") parser.add_argument("--tasks", default="0", type=str, help="discourse : TASK0") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.", ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not.", ) parser.add_argument("--evaluation_interval", default=1, type=int, help="evaluate very n epoch.") parser.add_argument( "--lr_scheduler", default="mannul", type=str, help="whether use learning rate scheduler.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", default=False, help="whether to use task specific tokens for the multi-task learning.", ) # todo args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] task_lr = [] task_id = 1 for i, task_id in enumerate(args.tasks.split("-")): task_id = str(1) task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) task_lr.append(task_cfg[task]["lr"]) base_lr = min(task_lr) loss_scale = {} for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id loss_scale[task] = task_lr[i] / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ("-".join("discourse") + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 3 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( # args, task_cfg, args.tasks.split("-"),'train' # ) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) labels = [ "Visible", 'Subjective', 'Action', 'Story', 'Meta', 'Irrelevant', 'Other' ] train_dataset = DiscourseRelationDataset( labels, task_cfg[task]["dataroot"], tokenizer, args.bert_model, task_cfg[task]["max_seq_length"], encoding="utf-8", visual_target=0, batch_size=batch_size, shuffle=False, num_workers=4, cache=5000, drop_last=False, cuda=False, objective=0, visualization=False, ) train_sampler = RandomSampler(train_dataset) train_loader = DataLoader( train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=0, pin_memory=True, ) # for i in train_loader: # print("hello") # todo task_ids , task_num_tiers task_ids = ['TASK0'] task_num_iters = [100] task_batch_size = task_cfg['TASK0']["batch_size"] print("task_batch_size") print(task_batch_size) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: print("*********** config.task_specific_tokens = True ************") config.task_specific_tokens = True if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) num_train_optimization_steps = 10 num_labels = len(labels) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) model.double() model = model.to(device) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): 
optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False, weight_decay=1e-4) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr, weight_decay=1e-4) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace( "module.", "", 1)] = checkpoint["model_state_dict"][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) # warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", batch_size) print(" Num steps: %d" % num_train_optimization_steps) target_path = os.path.join(task_cfg[task]["dataroot"], "all_targets_json.json") all_targets = json.load(open(target_path, "r")) model = model.to(device) print(next(model.parameters()).is_cuda) kmeans = MiniBatchKMeans(n_clusters=num_labels, random_state=0, batch_size=batch_size) with torch.no_grad(): model.eval() print("********* training *************") for step, batch in enumerate(train_loader): print("########## STEP @ {} ##########".format(step)) batch = tuple( t.to(device=device, non_blocking=True) if type(t) == torch.Tensor else t for t in batch) input_ids, input_mask, segment_ids, image_feat, image_loc, image_mask, image_id = ( batch) print(input_ids.shape) print(len(train_loader)) true_targets = [] for id in image_id: true_targets.append( np.fromiter(all_targets[id].values(), dtype=np.double)) true_targets = torch.from_numpy(np.array(true_targets)) true_targets = true_targets.to(device) model.double() model = model.to(device) pooled_output, discourse_prediction, vil_prediction, vil_prediction_gqa, vil_logit, vil_binary_prediction, vil_tri_prediction, vision_prediction, vision_logit, linguisic_prediction, linguisic_logit, _ \ = model( True, input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, ) kmeans.partial_fit(pooled_output.to('cpu')) evaluate(model, kmeans, device, task_cfg, tokenizer, args, labels)
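# Hedged sketch of one way an unsupervised evaluation like `evaluate(model, kmeans, ...)` above
# could score the clusters: build a cluster-vs-label count matrix over the labelled data and
# solve the best one-to-one cluster-to-label assignment with the Hungarian method. This is an
# illustration of the technique only, not the repository's actual evaluate() implementation;
# map_clusters_to_labels is an illustrative name.
import numpy as np
from scipy.optimize import linear_sum_assignment

def map_clusters_to_labels(cluster_ids, label_ids, num_classes):
    counts = np.zeros((num_classes, num_classes), dtype=np.int64)
    for c, l in zip(cluster_ids, label_ids):
        counts[c, l] += 1
    row, col = linear_sum_assignment(-counts)  # maximize total matched counts
    mapping = dict(zip(row, col))              # cluster id -> label id
    accuracy = counts[row, col].sum() / max(1, counts.sum())
    return mapping, accuracy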
    def __init__(self, config):
        super().__init__(config)
        self.mmt_config = BertConfig(**self.config.mmt)
        self._datasets = registry.get("config").datasets.split(",")
from flask import Flask, jsonify, request, render_template
from model import BertForEmotionClassification
from datasets import get_pretrained_model, Datasets
from pytorch_transformers.modeling_bert import BertConfig
import numpy as np
import torch

pretrained_model_path = './best_model/best_model.bin'
config_path = './best_model/bert_config.json'

pretrained = torch.load(pretrained_model_path)
bert_config = BertConfig(config_path)
bert_config.num_labels = 7

model = BertForEmotionClassification(bert_config)
model.load_state_dict(pretrained, strict=False)
model.eval()

softmax = torch.nn.Softmax(dim=1)
tokenizer, vocab = get_pretrained_model('etri')
# Emotion labels (Korean): fear, surprise, anger, sadness, neutral, happiness, disgust
label_list = np.array(['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'])


def get_prediction(sentence):
    sentence = Datasets.normalize_string(sentence)
    sentence = tokenizer.tokenize(sentence)
    sentence = tokenizer.convert_tokens_to_ids(sentence)
    sentence = [vocab['[CLS]']] + sentence + [vocab['[SEP]']]
    output = model(torch.tensor(sentence).unsqueeze(0))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--classifier', default='guoday', type=str, required=True, help='classifier type, guoday or MLP or GRU_MLP or ...') parser.add_argument('--optimizer', default='RAdam', type=str, required=True, help='optimizer we use, RAdam or ...') parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True, help="Whether to do label smoothing. yes or no.") ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_test", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_eval", default='yes', type=str, required=True, help="Whether to run eval on the dev set. yes or no.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=200, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # tensorboard_log_dir = args.output_dir # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now') # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean') # loss_now_variable = loss_now # loss_mean_variable = loss_mean # train_loss = tf.summary.scalar('train_loss', loss_now_variable) # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable) # merged = tf.summary.merge([train_loss, dev_loss_mean]) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) config.hidden_dropout_prob = args.dropout # Prepare model if args.do_train == 'yes': model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train == 'yes': print( '________________________now training______________________________' ) # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) # print('train_feature_size=', train_features.__sizeof__()) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # print('train_data=',train_data[0]) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.optimizer == 'RAdam': optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 loss_batch = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # with tf.Session() as sess: # summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph) # sess.run(tf.global_variables_initializer()) list_loss_mean = [] bx = [] eval_F1 = [] ax = [] for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss_batch += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # optimizer.backward(loss) loss.backward() else: loss.backward() # draw loss every 500 docs if (step + 1) % int(500 / (args.train_batch_size / args.gradient_accumulation_steps)) == 0: list_loss_mean.append(round(loss_batch / 8.0, 4)) bx.append(step + 1) plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1, color='b', marker='o', markerfacecolor='green', markersize=2) plt.savefig(args.output_dir + '/labeled.jpg') loss_batch = 0 # paras update every batch data. if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 # report results every 200 real batch. if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # do evaluation totally 5 times during training stage. if args.do_eval == 'yes' and (step + 1) % int( num_train_optimization_steps / 5) == 0 and step > 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() 
nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_labels = np.concatenate(inference_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() ############################################### num_gold_0 = np.sum(gold_labels == 0) num_gold_1 = np.sum(gold_labels == 1) num_gold_2 = np.sum(gold_labels == 2) right_0 = 0 right_1 = 0 right_2 = 0 error_0 = 0 error_1 = 0 error_2 = 0 for gold_label, inference_label in zip( gold_labels, inference_labels): if gold_label == inference_label: if gold_label == 0: right_0 += 1 elif gold_label == 1: right_1 += 1 else: right_2 += 1 elif inference_label == 0: error_0 += 1 elif inference_label == 1: error_1 += 1 else: error_2 += 1 recall_0 = right_0 / (num_gold_0 + 1e-5) recall_1 = right_1 / (num_gold_1 + 1e-5) recall_2 = right_2 / (num_gold_2 + 1e-5) precision_0 = right_0 / (error_0 + right_0 + 1e-5) precision_1 = right_1 / (error_1 + right_1 + 1e-5) precision_2 = right_2 / (error_2 + right_2 + 1e-5) f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5) f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5) f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5) output_dev_result_file = os.path.join( args.output_dir, "dev_results.txt") with open(output_dev_result_file, 'a', encoding='utf-8') as f: f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n') f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n') f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n') eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # draw loss. eval_F1.append(round(eval_accuracy, 4)) ax.append(step) plt.plot(ax, eval_F1, label='eval_F1', linewidth=1, color='r', marker='o', markerfacecolor='blue', markersize=2) for a, b in zip(ax, eval_F1): plt.text(a, b, b, ha='center', va='bottom', fontsize=8) plt.savefig(args.output_dir + '/labeled.jpg') result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("more accurate model arises, now best F1 = ", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model, only save the model it-self model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if (step+1) / int(num_train_optimization_steps/10) > 9.5: print("=" * 80) print("End of training. 
Saving Model......") # Save a trained model, only save the model it-self model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if args.do_test == 'yes': print( '___________________now testing for best eval f1 model_________________________' ) try: del model except: pass gc.collect() args.do_train = 'no' model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from " "https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() # print('test_logits=', logits) label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracy(logits, gold_labels)) elif flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) else: raise ValueError('flag not in [dev, test]') '''
    def build(self):
        self.mmt_config = BertConfig(**self.config.mmt)
        self.mmt = MMT(self.mmt_config)

        self.so_to_mmt_in = nn.Linear(3 * 1536, self.mmt_config.hidden_size)
        self.st_to_mmt_in = nn.Linear(3 * 1536, self.mmt_config.hidden_size)
        self.so_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.st_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.so_drop = nn.Dropout(0.1)
        self.st_drop = nn.Dropout(0.1)

        self.linear_go_to_mmt_in = nn.Linear(2048, self.mmt_config.hidden_size)
        self.linear_gt_to_mmt_in = nn.Linear(300, self.mmt_config.hidden_size)
        self.go_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.gt_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.go_drop = nn.Dropout(0.1)
        self.gt_drop = nn.Dropout(0.1)

        self.linear_updated_ocr_to_mmt_in = nn.Linear(300, self.mmt_config.hidden_size)
        self.updated_ocr_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.updated_ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)

        self.linear_joint = nn.Linear(1536, 768)

        self.answer_processor = registry.get(self._datasets[0] + "_answer_processor")
        self.ocr_ptr_net = OcrPtrNet(**self.config.classifier.ocr_ptr_net)

        # modules requiring custom learning rates (usually for finetuning)
        self.finetune_modules = []

        self._build_txt_encoding()
        self._build_obj_encoding()
        self._build_ocr_encoding()
        self._init_text_embeddings("text")

        # init feature embedding for "image"
        setattr(self, "image_feature_dim", self.config["image_feature_dim"])
        self.feature_embeddings_out_dim = 0
        feature_attn_model_params = self.config["image_feature_embeddings"][0]
        feature_embedding = ImageEmbedding(getattr(self, "image_feature_dim"),
                                           self.text_embeddings_out_dim,
                                           **feature_attn_model_params)
        self.feature_embeddings_out_dim += feature_embedding.out_dim
        self.feature_embeddings_out_dim *= getattr(self, "image_feature_dim")
        setattr(self, "image_feature_embeddings_out_dim",
                self.feature_embeddings_out_dim)
        del self.feature_embeddings_out_dim
        setattr(self, "image_feature_embedding", feature_embedding)

        # init feature embedding for "context"
        setattr(self, "context_feature_dim", self.config["context_feature_dim"])
        self.feature_embeddings_out_dim = 0
        feature_attn_model_params = self.config["context_feature_embeddings"][0]
        feature_embedding = ImageEmbedding(getattr(self, "context_feature_dim"),
                                           self.text_embeddings_out_dim,
                                           **feature_attn_model_params)
        self.feature_embeddings_out_dim += feature_embedding.out_dim
        self.feature_embeddings_out_dim *= getattr(self, "context_feature_dim")
        setattr(self, "context_feature_embeddings_out_dim",
                self.feature_embeddings_out_dim)
        del self.feature_embeddings_out_dim
        setattr(self, "context_feature_embedding", feature_embedding)

        self._init_combine_layer("image", "text")

        num_choices = registry.get(self._datasets[0] + "_num_final_outputs")
        self.classifier = ClassifierLayer(self.config["classifier"]["type"],
                                          in_dim=768,
                                          out_dim=num_choices - 50,
                                          **self.config["classifier"]["params"])
def train(args): set_seed(args) # Set device if args.device == 'cuda': device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') logger.info('use cuda') else: device = torch.device('cpu') logger.info('use cpu') # Set label list for classification if args.num_label == 'multi': label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'] elif args.num_label == 'binary': label_list = ['긍정', '부정'] logger.info('use {} labels for training'.format(len(label_list))) # Load pretrained model and model configuration pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type) if args.pretrained_model_path is None: # Use pretrained bert model(etri/skt) pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin') else: # Use further-pretrained bert model pretrained_model_path = args.pretrained_model_path logger.info('Pretrain Model : {}'.format(pretrained_model_path)) pretrained = torch.load(pretrained_model_path) if args.pretrained_type == 'skt' and 'bert.' not in list( pretrained.keys())[0]: logger.info('modify parameter names') # Change parameter name for consistency new_keys_ = ['bert.' + k for k in pretrained.keys()] old_values_ = pretrained.values() pretrained = {k: v for k, v in zip(new_keys_, old_values_)} bert_config = BertConfig( os.path.join(pretrained_path + '/bert_config.json')) bert_config.num_labels = len(label_list) model = BertForEmotionClassification(bert_config).to(device) model.load_state_dict(pretrained, strict=False) # Load Datasets tr_set = Datasets(file_path=args.train_data_path, label_list=label_list, pretrained_type=args.pretrained_type, max_len=args.max_len) # Use custom batch function collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx, tr_set.cls_idx, tr_set.sep_idx) tr_loader = DataLoader(dataset=tr_set, batch_size=args.train_batch_size, shuffle=True, num_workers=8, pin_memory=True, collate_fn=collate_fn) dev_set = Datasets(file_path=args.dev_data_path, label_list=label_list, pretrained_type=args.pretrained_type, max_len=args.max_len) dev_loader = DataLoader(dataset=dev_set, batch_size=args.eval_batch_size, num_workers=8, pin_memory=True, drop_last=False, collate_fn=collate_fn) # optimizer optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate, layerwise_decay=args.layerwise_decay) # lr scheduler t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs warmup_steps = int(t_total * args.warmup_percent) logger.info('total training steps : {}, lr warmup steps : {}'.format( t_total, warmup_steps)) # Use gradual warmup and linear decay scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) # for low-precision training if args.fp16: try: from apex import amp logger.info('Use fp16') except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0) # tensorboard setting save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format( args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps, t_total, args.warmup_percent, args.max_len, args.pretrained_type) save_best_path = "./model" if not os.path.isdir(save_path): os.makedirs(save_path) writer = SummaryWriter(save_path) if not os.path.isdir(save_best_path): os.makedirs(save_best_path) # Save best model results with resultwriter result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv") model.zero_grad() best_val_loss = 1e+9 global_step = 0 train_loss, train_acc, train_f1 = 0, 0, 0 logging_loss, logging_acc, logging_f1 = 0, 0, 0 logger.info('***** Training starts *****') total_result = [] for epoch in tqdm(range(args.epochs), desc='epochs'): for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)): model.train() x_train, mask_train, y_train = map(lambda x: x.to(device), batch) inputs = { 'input_ids': x_train, 'attention_mask': mask_train, 'classification_label': y_train, } output, loss = model(**inputs) y_max = output.max(dim=1)[1] cr = classification_report(y_train.tolist(), y_max.tolist(), labels=list(range(len(label_list))), target_names=label_list, output_dict=True) # Get accuracy(micro f1) if 'micro avg' not in cr.keys(): batch_acc = list(cr.items())[len(label_list)][1] else: # If at least one of labels does not exists in mini-batch, use micro average instead batch_acc = cr['micro avg']['f1-score'] # macro f1 batch_macro_f1 = cr['macro avg']['f1-score'] # accumulate measures grad_accu = args.gradient_accumulation_steps if grad_accu > 1: loss /= grad_accu batch_acc /= grad_accu batch_macro_f1 /= grad_accu if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() train_acc += batch_acc train_f1 += batch_macro_f1 if (global_step + 1) % grad_accu == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.grad_clip_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if global_step % args.logging_step == 0: acc_ = (train_acc - logging_acc) / args.logging_step f1_ = (train_f1 - logging_f1) / args.logging_step loss_ = (train_loss - logging_loss) / args.logging_step writer.add_scalars('loss', {'train': loss_}, global_step) writer.add_scalars('acc', {'train': acc_}, global_step) writer.add_scalars('macro_f1', {'train': f1_}, global_step) logger.info( '[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}' .format(global_step, t_total, loss_, acc_, f1_)) logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss # Get f1 score for each label f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items()) if i < len(label_list)] f1_log = "\n".join( ["{} : {}".format(l, f) for l, f in f1_results]) logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format( confusion_matrix(y_train.tolist(), y_max.tolist()))) # Validation val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device) val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}. 
val macro f1 : {:.3f}'.format( global_step, t_total, val_loss, val_acc, val_macro_f1) writer.add_scalars('loss', {'val': val_loss}, global_step) writer.add_scalars('acc', {'val': val_acc}, global_step) writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step) logger.info(val_result) total_result.append(val_result) if val_loss < best_val_loss: # Save model checkpoints torch.save(model.state_dict(), os.path.join(save_best_path, 'best_model.bin')) torch.save(args, os.path.join(save_best_path, 'training_args.bin')) # azure에 저장 # fc.compress_model(DIRNAME) # fc.upload_file(azs, constant.STORAGE_BERT_SENTIMENT_DIR, constant.MODEL_ZIP_NAME, MODEL_ZIP_ABS_PATH) # fc.upload_file(azs, constant.STORAGE_BERT_SENTIMENT_DIR, constant.VERSION_FILE, VERSION_FILE_ABS_PATH) logger.info('Saving model checkpoint to %s', save_path) # logger.info('Uploading model azure') best_val_loss = val_loss best_val_acc = val_acc best_val_macro_f1 = val_macro_f1 # Save results in 'model_saved_finetuning/results.csv' results = { 'val_loss': best_val_loss, 'val_acc': best_val_acc, 'val_macro_f1': best_val_macro_f1, 'save_dir': save_path, 'save_best_dir': save_best_path, 'pretrained_path': pretrained_path, } result_writer.update(args, **results) return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
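# The trainer above calls layerwise_decay_optimizer(), which is defined elsewhere in the
# repository. The sketch below shows what such a helper typically does -- give deeper encoder
# layers smaller learning rates -- assuming the encoder is exposed as model.bert.encoder.layer;
# the names and grouping are illustrative, not the repository's actual implementation.
import re

from torch.optim import AdamW


def layerwise_decay_optimizer_sketch(model, lr, layerwise_decay=0.95, weight_decay=0.01):
    """Give encoder layer i the learning rate lr * layerwise_decay ** (n_layers - i)."""
    n_layers = len(model.bert.encoder.layer)
    groups = []
    for name, param in model.named_parameters():
        match = re.search(r"encoder\.layer\.(\d+)\.", name)
        if match:
            depth = n_layers - int(match.group(1))
        elif "embeddings" in name:
            depth = n_layers + 1        # embeddings get the most-decayed learning rate
        else:
            depth = 0                   # pooler / classification head keep the full lr
        groups.append({
            "params": [param],
            "lr": lr * (layerwise_decay ** depth),
            "weight_decay": 0.0 if name.endswith("bias") else weight_decay,
        })
    return AdamW(groups, lr=lr)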
def train(args): set_seed(args) # Set device if args.device == 'cuda': device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') logger.info('use cuda') else: device = torch.device('cpu') logger.info('use cpu') # Load pretrained model and model configuration pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type) if args.pretrained_model_path is None: # Use pretrained bert model(etri/skt) pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin') else: # Use further-pretrained bert model pretrained_model_path = args.pretrained_model_path logger.info('Pretrain Model : {}'.format(pretrained_model_path)) pretrained = torch.load(pretrained_model_path) if args.pretrained_type == 'skt' and 'bert.' not in list( pretrained.keys())[0]: logger.info('modify parameter names') # Change parameter name for consistency new_keys_ = ['bert.' + k for k in pretrained.keys()] old_values_ = pretrained.values() pretrained = {k: v for k, v in zip(new_keys_, old_values_)} bert_config = BertConfig( os.path.join(pretrained_path + '/bert_config.json')) model = BertForMLM(bert_config).to(device) model.load_state_dict(pretrained, strict=False) # Load Datasets tr_set = Datasets(file_path=args.train_data_path, pretrained_type=args.pretrained_type, max_len=args.max_len) # Use custom batch function collate_fn = MLMBatchFunction(args.max_len, tr_set.vocab) tr_loader = DataLoader(dataset=tr_set, batch_size=args.train_batch_size, shuffle=True, num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn) if args.do_eval: dev_set = Datasets(file_path=args.dev_data_path, pretrained_type=args.pretrained_type, max_len=args.max_len) dev_loader = DataLoader(dataset=dev_set, batch_size=args.eval_batch_size, num_workers=8, pin_memory=True, drop_last=False, collate_fn=collate_fn) # optimizer optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate, layerwise_decay=args.layerwise_decay) # lr scheduler t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs warmup_steps = int(t_total * args.warmup_percent) logger.info('total training steps : {}, lr warmup steps : {}'.format( t_total, warmup_steps)) # Use gradual warmup and cosine decay scheduler = optimization.WarmupCosineWithHardRestartsSchedule( optimizer, warmup_steps=warmup_steps, t_total=t_total) # for low-precision training if args.fp16: try: from apex import amp logger.info('Use fp16') except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0) # tensorboard setting save_path = "./model_saved_pretrain/lr{},batch{},total{},warmup{},len{},{}".format( args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps, t_total, args.warmup_percent, args.max_len, args.pretrained_type) if not os.path.isdir(save_path): os.makedirs(save_path) writer = SummaryWriter(save_path) # Save best model results with resultwriter result_writer = utils.ResultWriter("./model_saved_pretrain/results.csv") model.zero_grad() best_val_loss = 1e+9 best_val_acc = 0 global_step = 0 train_loss, train_acc = 0, 0 val_loss, val_acc = 0, 0 logging_loss, logging_acc = 0, 0 logger.info('***** Training starts *****') total_result = [] for epoch in tqdm(range(args.epochs), desc='epochs'): for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)): model.train() x_train, y_train, mask_train = map(lambda x: x.to(device), batch) inputs = { 'input_ids': x_train, 'attention_mask': mask_train, 'masked_lm_labels': y_train, } output, loss = model(**inputs) y_max = output.max(dim=2)[1] # Get accuracy for maked tokens total_length = torch.ones_like(y_train).masked_fill( y_train == -1, 0).sum().item() total_sum = torch.zeros_like(y_max).masked_fill( y_max == y_train, 1).sum().item() batch_acc = total_sum / total_length # accumulate measures grad_accu = args.gradient_accumulation_steps if grad_accu > 1: loss = loss / grad_accu batch_acc = batch_acc / grad_accu if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() train_acc += batch_acc if (step + 1) % grad_accu == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.grad_clip_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if global_step % args.logging_step == 0: acc_ = (train_acc - logging_acc) / args.logging_step loss_ = (train_loss - logging_loss) / args.logging_step writer.add_scalars('loss', {'train': loss_}, global_step) writer.add_scalars('acc', {'train': acc_}, global_step) writer.add_scalars('lr', {'lr': scheduler.get_lr()[0]}, global_step) logger.info( '[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}'.format( global_step, t_total, loss_, acc_)) logging_acc, logging_loss = train_acc, train_loss if args.do_eval: # Validation val_loss, val_acc = evaluate(args, dev_loader, model, device) val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}'.format( global_step, t_total, val_loss, val_acc) logger.info(val_result) total_result.append(val_result) if val_loss <= best_val_loss: # Save model checkpoints torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin')) torch.save(args, os.path.join(save_path, 'training_args.bin')) logger.info('Saving model checkpoint to %s', save_path) best_val_loss = val_loss best_val_acc = val_acc if (epoch + 1) % args.saving_step == 0: torch.save( model.state_dict(), os.path.join(save_path, 'epoch{}_model.bin'.format(epoch + 1))) # Save results in 'model_saved_pretrain/results.csv' results = { 'train_loss': loss_, 'train_acc': acc_, 'val_loss': best_val_loss, 'val_acc': best_val_acc, 'save_dir': save_path, 'global_step': global_step, } result_writer.update(args, **results) return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
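# MLMBatchFunction above is referenced but not shown. The sketch below is the standard BERT
# masking recipe such a collate function typically implements (mask 15% of positions; of those,
# 80% -> [MASK], 10% -> random token, 10% unchanged). Token ids and special-token handling are
# assumptions; the -1 label for unmasked positions matches the accuracy computation above.
import torch


def mask_tokens_sketch(input_ids, mask_token_id, vocab_size, pad_token_id, mlm_prob=0.15):
    labels = input_ids.clone()
    # Sample the positions to predict, never masking padding.
    prob = torch.full(labels.shape, mlm_prob)
    prob.masked_fill_(input_ids == pad_token_id, 0.0)
    masked = torch.bernoulli(prob).bool()
    labels[~masked] = -1
    # 80% of the selected positions become [MASK].
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    input_ids[replaced] = mask_token_id
    # Half of the remainder (10% overall) becomes a random vocabulary token.
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    input_ids[randomized] = torch.randint(vocab_size, labels.shape)[randomized]
    return input_ids, labels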
def test_eval(self): data = DATAMultiWOZ( debug=False, data_dir=self.data_dir ) test_examples = data.read_examples(os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) dialogueID = [] truth = defaultdict(set) for each in test_examples: dialogueID.append(each.guid) truth[each.guid] = each.turn_belief test_features = data.convert_examples_to_features(test_examples, self.tokenizer, self.max_seq_length) test_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long) test_input_mask = torch.tensor(data.select_field(test_features, 'input_mask'), dtype=torch.long) test_segment_ids = torch.tensor(data.select_field(test_features, 'segment_ids'), dtype=torch.long) test_utterance_mask = torch.tensor(data.select_field(test_features, 'utterance_mask'), dtype=torch.long) test_domainslot_mask = torch.tensor(data.select_field(test_features, 'domainslot_mask'), dtype=torch.long) test_label_tokens_start = torch.tensor([f.label_tokens_start for f in test_features], dtype=torch.long) test_label_tokens_end = torch.tensor([f.label_tokens_end for f in test_features], dtype=torch.long) test_label_sentence_domainslot = torch.tensor([f.label_sentence_domainslot for f in test_features], dtype=torch.long) test_label_tokens_domainslot = torch.tensor([f.label_tokens_domainslot for f in test_features], dtype=torch.long) test_hist_tokens = [f.hist_token for f in test_features] test_data = TensorDataset( test_input_ids, test_input_mask, test_segment_ids, test_utterance_mask, test_domainslot_mask, test_label_tokens_start, test_label_tokens_end, test_label_sentence_domainslot, test_label_tokens_domainslot ) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained( os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() gold_labels_tokens_start = [] gold_labels_tokens_end = [] gold_label_sentence_domainslot = [] gold_label_tokens_domainslot = [] scores_tokens_start = [] scores_tokens_end = [] scores_sentence_domainslot = [] scores_tokens_domainslot = [] for input_ids, input_mask, segment_ids, \ utterance_mask, domainslot_mask, \ label_tokens_start, label_tokens_end, \ label_sentence_domainslot, label_tokens_domainslot in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) domainslot_mask = domainslot_mask.to(self.device) label_tokens_start = label_tokens_start.to(self.device) label_tokens_end = label_tokens_end.to(self.device) label_sentence_domainslot = label_sentence_domainslot.to(self.device) # print(label_sentence_domainslot.size()) # print(label_sentence_domainslot) label_tokens_domainslot = label_tokens_domainslot.to(self.device) with torch.no_grad(): batch_eval_loss_tokens_start, batch_eval_loss_tokens_end, batch_eval_loss_sentence_domainslot, batch_eval_loss_tokens_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, domainslot_mask=domainslot_mask, label_tokens_start=label_tokens_start, label_tokens_end=label_tokens_end, label_sentence_domainslot=label_sentence_domainslot, label_tokens_domainslot=label_tokens_domainslot ) logits_tokens_start, logits_tokens_end, 
logits_sentence_domainslot, logits_tokens_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, domainslot_mask=domainslot_mask ) logits_tokens_start = logits_tokens_start.view(-1, 2).cpu().numpy() logits_tokens_end = logits_tokens_end.view(-1, 2).cpu().numpy() logits_tokens_domainslot = logits_tokens_domainslot.view(-1, 2).detach().cpu().numpy() logits_sentence_domainslot = logits_sentence_domainslot.view(-1, 2).cpu().numpy() label_tokens_start = label_tokens_start.view(-1).to('cpu').numpy() label_tokens_end = label_tokens_end.view(-1).to('cpu').numpy() label_sentence_domainslot = label_sentence_domainslot.to('cpu').numpy() label_tokens_domainslot = label_tokens_domainslot.to('cpu').numpy() scores_tokens_start.append(logits_tokens_start) scores_tokens_end.append(logits_tokens_end) scores_sentence_domainslot.append(logits_sentence_domainslot) scores_tokens_domainslot.append(logits_tokens_domainslot) gold_labels_tokens_start.append(label_tokens_start) gold_labels_tokens_end.append(label_tokens_end) gold_label_sentence_domainslot.append(label_sentence_domainslot) gold_label_tokens_domainslot.append(label_tokens_domainslot) gold_labels_tokens_start = np.concatenate(gold_labels_tokens_start, 0) gold_labels_tokens_end = np.concatenate(gold_labels_tokens_end, 0) gold_label_sentence_domainslot = np.concatenate(gold_label_sentence_domainslot, 0) gold_label_tokens_domainslot = np.concatenate(gold_label_tokens_domainslot, 0) scores_tokens_start = np.concatenate(scores_tokens_start, 0) scores_tokens_end = np.concatenate(scores_tokens_end, 0) scores_sentence_domainslot = np.concatenate(scores_sentence_domainslot, 0) scores_tokens_domainslot = np.concatenate(scores_tokens_domainslot, 0) # 计算评价指标 # eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain',report=True) # eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy,mode='dependcy',report=True) eval_F1_tokenstart, eval_F1_tokenend, F1_sentence_domainslot, F1_token_domainslot = compute_jointGoal_domainslot_1_( dialogueID, truth, test_hist_tokens, scores_tokens_start, scores_tokens_end, scores_sentence_domainslot, scores_tokens_domainslot, gold_labels_tokens_start, gold_labels_tokens_end, gold_label_sentence_domainslot, gold_label_tokens_domainslot ) print( 'F1_token_domainslot', F1_token_domainslot, 'F1_sentence_domainslot', F1_sentence_domainslot, 'eval_F1_tokenstart', eval_F1_tokenstart, 'eval_F1_tokenend', eval_F1_tokenend )
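# A minimal sketch (not the repository's decoding code) of turning the accumulated start/end
# logits above into token spans: argmax the 2-way logits per position, then pair each open
# start marker with the next end marker.
import numpy as np


def spans_from_logits_sketch(start_logits, end_logits):
    starts = np.argmax(start_logits, axis=-1)   # 1 where a slot value starts
    ends = np.argmax(end_logits, axis=-1)       # 1 where a slot value ends
    spans, open_start = [], None
    for i, (s, e) in enumerate(zip(starts, ends)):
        if s == 1 and open_start is None:
            open_start = i
        if e == 1 and open_start is not None:
            spans.append((open_start, i))
            open_start = None
    return spans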
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument("--batch_size", default=30, type=int, help="what is the batch size?") parser.add_argument("--tasks", default="", type=str, help="1-2-3... training task separate by -") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. 
\ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) if args.task_specific_tokens: config.task_specific_tokens = True if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-")) tbLogger = utils.tbLogger( timeStamp, savePath, task_names, task_ids, task_num_iters, 1, save_logger=False, txt_name="eval.txt", ) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: config.task_specific_tokens = True if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) print("***** Running evaluation *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. 
for task_id in task_ids: results = [] others = [] for i, batch in enumerate(task_dataloader_val[task_id]): loss, score, batch_size, results, others = EvaluatingModel( args, task_cfg, device, task_id, batch, model, task_dataloader_val, task_losses, results, others, ) tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, "val") sys.stdout.write("%d/%d\r" % (i, len(task_dataloader_val[task_id]))) sys.stdout.flush() # save the result or evaluate the result. ave_score = tbLogger.showLossVal(task_id) if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
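# tbLogger.step_val() / showLossVal() above keep a running, batch-size-weighted average of the
# validation score. A tiny stand-in (not the vilbert utils implementation) showing the same idea:
class RunningScoreSketch:
    def __init__(self):
        self.total_score = 0.0
        self.total_examples = 0

    def step(self, score, batch_size):
        self.total_score += score * batch_size
        self.total_examples += batch_size

    def average(self):
        return self.total_score / max(self.total_examples, 1)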
def __init__(self, config):
    super().__init__(config)
    self.mmt_config = BertConfig(**self.config.mmt)
    self._datasets = registry.get("config").datasets.split(",")
    # The original assigned a bare local `repetition_mask = None`, which has no effect;
    # store it on the instance so later methods can actually read it.
    self.repetition_mask = None
    self.coverage_ratio = -1e10
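# BertConfig(**self.config.mmt) above expects the `mmt` section of the model config to be a
# flat mapping of BertConfig fields. A hypothetical example of such a section (the values are
# illustrative only), using the pytorch_transformers BertConfig for concreteness:
from pytorch_transformers.modeling_bert import BertConfig

mmt_section = {
    "hidden_size": 768,
    "num_hidden_layers": 4,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
}
mmt_config = BertConfig(**mmt_section)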
def main():
    batch_size = 32
    max_seq_len = 128
    n_epochs = 3
    bert_model = 'bert-base-uncased'
    learning_rate = 3e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    num_labels = 1
    output_dir = "fine_tuned--{0}--SEQ_LEN={1}--BATCH_SIZE={2}--HEAD={3}".format(
        bert_model, max_seq_len, batch_size, num_labels)
    # Use os.path.join instead of a Windows-only backslash path.
    dataset_dir = os.path.join("dataset", "custom_training_set.csv")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = BertConfig.from_pretrained(bert_model)
    config.num_labels = num_labels
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the pre-trained weights; instantiating BertForSequenceClassification(config)
    # directly would fine-tune a randomly initialized encoder.
    model = BertForSequenceClassification.from_pretrained(bert_model, config=config)
    model.to(device)

    train_dataset = Dataset(dataset_dir, tokenizer, max_seq_len)
    num_train_optimization_steps = int(len(train_dataset) / batch_size) * n_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    train_sampler = data.RandomSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset,
                                       sampler=train_sampler,
                                       batch_size=batch_size)

    model.train()
    for _ in trange(n_epochs, desc="Epoch"):
        for batch in tqdm(train_dataloader, desc="Iteration"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, labels = batch
            # Pass the tensors by keyword; passed positionally, `labels` would land in the
            # position_ids slot of BertForSequenceClassification.forward.
            outputs = model(input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            # Step the optimizer before the learning-rate scheduler.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
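# After model.save_pretrained(output_dir) above, the fine-tuned regression head (num_labels=1)
# can be reloaded for inference. A usage sketch; the text argument and helper name are
# hypothetical:
import torch
from pytorch_transformers import BertForSequenceClassification, BertTokenizer


def predict_sketch(text, output_dir, max_seq_len=128):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.eval()
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_len - 2] + ["[SEP]"]
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    with torch.no_grad():
        logits = model(input_ids)[0]
    return logits.squeeze().item()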
def test_eval(self): data = DATAMultiWOZ( debug=False, data_dir=self.data_dir ) test_examples = data.read_examples(os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) dialogueID = [x.guid for x in test_examples] utterance_text = [x.text_history for x in test_examples] test_features = data.convert_examples_to_features(test_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field(test_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field(test_features, 'segment_ids'), dtype=torch.long) eval_labels_domainslot = torch.tensor([f.labels_domainslot for f in test_features], dtype=torch.float) eval_labels_domain = torch.tensor([f.labels_domain for f in test_features], dtype=torch.long) eval_labels_dependcy = torch.tensor([f.labels_dependcy for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,eval_labels_domainslot,eval_labels_domain,eval_labels_dependcy) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained( os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() inference_labels = [] gold_labels_domain = [] gold_labels_dependcy = [] gold_labels_domainslot = [] scores_domainslot = [] scores_domain = [] scores_dependcy = [] for input_ids, input_mask, segment_ids,label_domainslot,label_domain,label_dependcy in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domainslot = label_domainslot.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): logits_domainslot,logits_domain,logits_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask ) logits_domainslot = torch.sigmoid(logits_domainslot) logits_domainslot = (logits_domainslot > 0.4).float() logits_domainslot = logits_domainslot.cpu().long().numpy() logits_domain = logits_domain.view(-1, self.num_labels_domain).cpu().numpy() logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).cpu().numpy() label_domainslot = label_domainslot.to('cpu').numpy() label_domain = label_domain.view(-1).to('cpu').numpy() label_dependcy = label_dependcy.view(-1).to('cpu').numpy() scores_domainslot.append(logits_domainslot) scores_domain.append(logits_domain) scores_dependcy.append(logits_dependcy) gold_labels_domainslot.append(label_domainslot) gold_labels_domain.append(label_domain) gold_labels_dependcy.append(label_dependcy) gold_labels_domainslot = np.concatenate(gold_labels_domainslot, 0) gold_labels_domain = np.concatenate(gold_labels_domain, 0) gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0) scores_domainslot = np.concatenate(scores_domainslot, 0) scores_domain = np.concatenate(scores_domain, 0) scores_dependcy = np.concatenate(scores_dependcy, 0) # 计算评价指标 assert scores_domain.shape[0] == scores_dependcy.shape[0] == gold_labels_domain.shape[0] == gold_labels_dependcy.shape[0] eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain',report=True) eval_accuracy_dependcy = 
accuracyF1(scores_dependcy, gold_labels_dependcy,mode='dependcy',report=True) eval_jointGoal = compute_jointGoal_domainslot( dialogueID, utterance_text, scores_domainslot, gold_labels_domainslot, scores_domain, gold_labels_domain, scores_dependcy, gold_labels_dependcy ) print('eval_accuracy_domain',eval_accuracy_domain) print('eval_accuracy_dependcy', eval_accuracy_dependcy) print('eval_jointGoal', eval_jointGoal)
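# accuracyF1() and compute_jointGoal_domainslot() above are project helpers that are not shown
# in this excerpt. A plausible minimal version of the F1 helper (an assumption, not the
# original): take the argmax over the class logits and report macro-F1 with an optional
# per-class breakdown.
import numpy as np
from sklearn.metrics import classification_report, f1_score


def accuracyF1_sketch(scores, gold, report=False):
    preds = np.argmax(scores, axis=1)
    if report:
        print(classification_report(gold, preds))
    return f1_score(gold, preds, average='macro')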
# if os.path.exists(data_path) is not True:
#     os.mkdir(data_path)
# make data: build train_path / dev_path from data_path when they are not both present
# if (os.path.exists(train_path) and os.path.exists(dev_path)) is not True:
#     make_data(data_path, train_path, dev_path)
if args.limit_vocab:
    vocab_path = os.path.join(args.folder_path, args.limit_vocabulary_name)
else:
    vocab_path = os.path.join(args.folder_path, args.vocab_name)
# Create the tokenizer.
# Note: pretraining with the vocab file once failed here -- the written vocab came out garbled.
tokenizer = BertTokenizer.from_pretrained(vocab_path)
config_path = os.path.join(args.folder_path, args.config_name)
config = BertConfig.from_pretrained(config_path)
if args.limit_vocab:
    config.vocab_size = tokenizer.vocab_size
rouge = Rouge()
# if os.path.exists(save_path) is not True:
#     os.mkdir(save_path)
dev_data = pd.read_csv(dev_path, encoding='utf-8')
dev_texts = list(dev_data['text'].values)
dev_summaries = list(dev_data['summarization'].values)
summary_all = []
for summary in dev_summaries:
    # Space-separate the characters of each reference summary for char-level ROUGE.
    summary = ''.join([word + ' ' for words in summary for word in words])
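# The dev loop that follows scores generated summaries against dev_summaries with the Rouge()
# object created above. A usage sketch of the `rouge` package on char-tokenized strings
# (the example strings below are hypothetical):
from rouge import Rouge

hypothesis = ' '.join('这是一个生成的摘要')   # space-separated characters, as prepared above
reference = ' '.join('这是一个参考的摘要')
char_rouge = Rouge().get_scores(hypothesis, reference)[0]
print(char_rouge['rouge-l']['f'])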
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader() num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids,label_domainslot, label_domain,label_dependcy = batch loss_domainslot,loss_domain,loss_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domainslot = label_domainslot, label_domain=label_domain, label_dependcy = label_dependcy ) loss = loss_domainslot+loss_domain+loss_dependcy tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels_domainslot = [] gold_labels_domain = [] gold_labels_dependcy = [] inference_logits = [] scores_domainslot = [] scores_domain = [] scores_dependcy = [] # ID = [x.guid for x in eval_examples] dialogueID = [x.guid for x in eval_examples] utterance_text = [x.text_eachturn for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss_domainslot,eval_loss_domain,eval_loss_dependcy = 0,0,0 eval_accuracy_domainslot,eval_accuracy_domain,eval_accuracy_dependcy = 0,0,0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids,label_domainslot,label_domain,label_dependcy in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domainslot = 
label_domainslot.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): batch_eval_loss_domainslot,batch_eval_loss_domain,batch_eval_loss_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domainslot = label_domainslot, label_domain=label_domain, label_dependcy=label_dependcy ) logits_domainslot,logits_domain,logits_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask ) logits_domainslot = torch.sigmoid(logits_domainslot) logits_domainslot = (logits_domainslot > 0.4).float() logits_domainslot = logits_domainslot.cpu().long().numpy() logits_domain = logits_domain.view(-1, self.num_labels_domain).detach().cpu().numpy() logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).detach().cpu().numpy() label_domainslot = label_domainslot.to('cpu').numpy() label_domain = label_domain.view(-1).to('cpu').numpy() label_dependcy = label_dependcy.view(-1).to('cpu').numpy() scores_domainslot.append(logits_domainslot) scores_domain.append(logits_domain) scores_dependcy.append(logits_dependcy) gold_labels_domainslot.append(label_domainslot) gold_labels_domain.append(label_domain) gold_labels_dependcy.append(label_dependcy) eval_loss_domainslot += batch_eval_loss_domainslot.mean().item() eval_loss_domain += batch_eval_loss_domain.mean().item() eval_loss_dependcy += batch_eval_loss_dependcy.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels_domainslot = np.concatenate(gold_labels_domainslot, 0) gold_labels_domain = np.concatenate(gold_labels_domain, 0) gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0) scores_domainslot = np.concatenate(scores_domainslot, 0) scores_domain = np.concatenate(scores_domain, 0) scores_dependcy = np.concatenate(scores_dependcy, 0) model.train() eval_loss_domainslot = eval_loss_domainslot/nb_eval_steps eval_loss_domain = eval_loss_domain / nb_eval_steps eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps # print(scores_domainslot.shape) # print(gold_labels_domainslot.shape) # print(scores_domainslot) # print(gold_labels_domainslot) # exit() eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain') eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy ,mode= 'dependcy') eval_jointGoal_domainslot = compute_jointGoal_domainslot( dialogueID, utterance_text, scores_domainslot, gold_labels_domainslot, scores_domain, gold_labels_domain, scores_dependcy, gold_labels_dependcy ) print( 'eval_jointGoal_domainslot',eval_jointGoal_domainslot, 'eval_F1_domain',eval_accuracy_domain, 'eval_F1_dependcy', eval_accuracy_dependcy, 'global_step',global_step, 'loss',train_loss ) result = { 'eval_jointGoal_domainslot':eval_jointGoal_domainslot, 'eval_loss_domainslot':eval_loss_domainslot, 'eval_loss_domain': eval_loss_domain, 'eval_loss_dependcy':eval_loss_dependcy, 'eval_F1_domain': eval_accuracy_domain, 'eval_F1_dependcy': eval_accuracy_dependcy, 'global_step': global_step, 'loss': train_loss} output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy_domain > best_acc : print("=" * 80) print("Best F1", eval_accuracy_domain) print("Saving Model......") # best_acc = eval_accuracy best_acc = 
eval_accuracy_domain # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
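# The loop above draws batches with cycle(train_dataloader) so that a fixed number of
# optimization steps can span several passes over the data. One caveat worth noting:
# itertools.cycle caches the elements it saw on the first pass, so the DataLoader's shuffling
# only affects the first epoch and every batch stays in memory. A generic sketch of the
# pattern (step_fn is a hypothetical callback):
from itertools import cycle


def take_steps_sketch(dataloader, num_steps, step_fn):
    stream = cycle(dataloader)
    for _ in range(num_steps):
        step_fn(next(stream))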
def __init__(self, args=None, labels=None, device='cuda', bert_model_path='bert-base-uncased', architecture="DocumentBertLSTM", batch_size=10, bert_batch_size=7, learning_rate=5e-5, weight_decay=0, use_tensorboard=False): if args is not None: self.args = vars(args) if not args: self.args = {} self.args['bert_model_path'] = bert_model_path self.args['device'] = device self.args['learning_rate'] = learning_rate self.args['weight_decay'] = weight_decay self.args['batch_size'] = batch_size self.args['labels'] = labels self.args['bert_batch_size'] = bert_batch_size self.args['architecture'] = architecture self.args['use_tensorboard'] = use_tensorboard if 'fold' not in self.args: self.args['fold'] = 0 assert self.args[ 'labels'] is not None, "Must specify all labels in prediction" self.log = logging.getLogger() self.bert_tokenizer = BertTokenizer.from_pretrained( self.args['bert_model_path']) #account for some random tensorflow naming scheme if os.path.exists(self.args['bert_model_path']): if os.path.exists( os.path.join(self.args['bert_model_path'], CONFIG_NAME)): config = BertConfig.from_json_file( os.path.join(self.args['bert_model_path'], CONFIG_NAME)) elif os.path.exists( os.path.join(self.args['bert_model_path'], 'bert_config.json')): config = BertConfig.from_json_file( os.path.join(self.args['bert_model_path'], 'bert_config.json')) else: raise ValueError( "Cannot find a configuration for the BERT based model you are attempting to load." ) else: config = BertConfig.from_pretrained(self.args['bert_model_path']) config.__setattr__('num_labels', len(self.args['labels'])) config.__setattr__('bert_batch_size', self.args['bert_batch_size']) if 'use_tensorboard' in self.args and self.args['use_tensorboard']: assert 'model_directory' in self.args is not None, "Must have a logging and checkpoint directory set." from torch.utils.tensorboard import SummaryWriter self.tensorboard_writer = SummaryWriter( os.path.join( self.args['model_directory'], "..", "runs", self.args['model_directory'].split(os.path.sep)[-1] + '_' + self.args['architecture'] + '_' + str(self.args['fold']))) self.bert_doc_classification = document_bert_architectures[ self.args['architecture']].from_pretrained( self.args['bert_model_path'], config=config) self.bert_doc_classification.freeze_bert_encoder() self.bert_doc_classification.unfreeze_bert_encoder_last_layers() self.optimizer = torch.optim.Adam( self.bert_doc_classification.parameters(), weight_decay=self.args['weight_decay'], lr=self.args['learning_rate'])
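# freeze_bert_encoder() / unfreeze_bert_encoder_last_layers() above come from the document-BERT
# architectures and are not shown here. A minimal sketch of that freezing pattern, assuming the
# wrapped encoder is exposed as model.bert (illustrative only):
def freeze_bert_encoder_sketch(model):
    for param in model.bert.parameters():
        param.requires_grad = False


def unfreeze_last_layers_sketch(model, n_last=2):
    for layer in model.bert.encoder.layer[-n_last:]:
        for param in layer.parameters():
            param.requires_grad = True
    for param in model.bert.pooler.parameters():
        param.requires_grad = True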
# tokenizer ptr_tokenizer = BertTokenizer.from_pretrained( 'pretrained/vocab.korean.rawtext.list', do_lower_case=False) with open('pretrained/vocab.pkl', mode='rb') as io: vocab = pickle.load(io) pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)) preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence) # model (restore) checkpoint_manager = CheckpointManager(model_dir) checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar') config = BertConfig('pretrained/bert_config.json') model = PairwiseClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab) model.load_state_dict(checkpoint['model_state_dict']) # evaluation filepath = getattr(data_config, args.data_name) ds = Corpus(filepath, preprocessor.preprocess) dl = DataLoader(ds, batch_size=model_config.batch_size, num_workers=4) device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') model.to(device) summary_manager = SummaryManager(model_dir)
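# The evaluation step that consumes `dl`, `model`, and `summary_manager` is not part of this
# excerpt. A minimal accuracy-only sketch under the assumption that each batch is an
# (inputs, labels) pair (the real Corpus may yield a different layout):
import torch


def evaluate_sketch(model, dl, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in dl:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = model(inputs).argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / max(total, 1)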
def test_eval(self): data = DATAMultiWOZ(debug=False, data_dir=self.data_dir) test_examples = data.read_examples( os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) ID = [x.guid for x in test_examples] test_features = data.convert_examples_to_features( test_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( test_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( test_features, 'segment_ids'), dtype=torch.long) eval_labels_domain = torch.tensor( [f.labels_domain for f in test_features], dtype=torch.long) eval_labels_dependcy = torch.tensor( [f.labels_dependcy for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, eval_labels_domain, eval_labels_dependcy) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForTokenClassification.from_pretrained(os.path.join( self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() inference_labels = [] gold_labels = [] scores = [] for input_ids, input_mask, segment_ids, label_domain, label_dependcy in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, ).view(-1, self.num_labels).detach().cpu().numpy() label_domain = label_domain.view(-1).to('cpu').numpy() scores.append(logits) inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_domain) gold_labels = np.concatenate(gold_labels, 0) scores = np.concatenate(scores, 0) # 计算评价指标 print(scores.shape) print(gold_labels.shape) assert scores.shape[0] == gold_labels.shape[0] eval_accuracy = accuracyF1(scores, gold_labels) # eval_DOUBAN_MRR,eval_DOUBAN_mrr,eval_DOUBAN_MAP,eval_Precision1 = compute_DOUBAN(ID,scores,gold_labels) # print( # 'eval_MRR',eval_DOUBAN_MRR,eval_DOUBAN_mrr, # 'eval_MAP',eval_DOUBAN_MAP, # 'eval_Precision1',eval_Precision1) print('F1', eval_accuracy)
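# The flattened token-level evaluation above scores padding positions together with real
# tokens. A common refinement (an assumption, not present in this excerpt) is to drop positions
# whose attention mask is 0 before computing F1:
import numpy as np
from sklearn.metrics import f1_score


def masked_token_f1_sketch(logits, labels, attention_mask):
    keep = attention_mask.reshape(-1).astype(bool)
    preds = np.argmax(logits, axis=-1).reshape(-1)[keep]
    gold = labels.reshape(-1)[keep]
    return f1_score(gold, preds, average='macro')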
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default='save/multitask_model/multi_task_model.bin', type=str, help="path to the fine-tuned model", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument("--tasks", default="8", type=str, help="1-2-3... 
training task separate by -") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--zero_shot", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument("--batch_size", default=1, type=int, help="which split to use.") parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig else: from vilbert.vilbert import BertConfig task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] if "/" in args.from_pretrained: timeStamp = args.from_pretrained.split("/")[1] else: timeStamp = args.from_pretrained savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-")) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) if args.task_specific_tokens: config.task_specific_tokens = True config.fast_mode = True if args.zero_shot: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) torch.cuda.empty_cache() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, deay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. for task_id in task_ids: results = [] others = [] noimg = len(task_datasets_val[task_id]._image_entries) index = int(noimg / 2) nocap = len(task_datasets_val[task_id]._caption_entries) score_matrix = np.zeros((nocap, noimg)) # 5000,1000 target_matrix = np.zeros((nocap, noimg)) rank_matrix = np.ones((nocap)) * noimg count = 0 for i, batch in enumerate(task_dataloader_val[task_id]): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = ( batch) task_tokens = (question.new().resize_(question.size(0), 1).fill_(int(task_id[4:]))) batch_size = features.size(0) features = features.squeeze(0) spatials = spatials.squeeze(0) image_mask = image_mask.squeeze(0) with torch.no_grad(): if args.zero_shot: vil_logit = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) score_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = (torch.softmax( vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = ( target.view(-1).float().cpu().numpy()) else: vil_logit = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) if image_idx == 0: score_matrix[caption_idx, image_idx * index:(image_idx + 1) * index] = ( vil_logit.view(-1).cpu().numpy()) target_matrix[ caption_idx, image_idx * index:(image_idx + 1) * index] = (target.view(-1).float().cpu().numpy()) else: score_matrix[caption_idx, image_idx * index:] = ( vil_logit.view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * index:] = ( target.view(-1).float().cpu().numpy()) if image_idx.item() == 1: _, preds = torch.max( torch.from_numpy( score_matrix[caption_idx]).unsqueeze(0), 1) rank = np.where( (np.argsort(-score_matrix[caption_idx]) == np.where( target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0] rank_matrix[caption_idx] = rank rank_matrix_tmp = rank_matrix[:caption_idx + 1] r1 = 100.0 * np.sum( rank_matrix_tmp < 1) / len(rank_matrix_tmp) r5 = 100.0 * np.sum( rank_matrix_tmp < 5) / len(rank_matrix_tmp) r10 = 100.0 * np.sum( rank_matrix_tmp < 10) / len(rank_matrix_tmp) medr = np.floor(np.median(rank_matrix_tmp) + 1) meanr = np.mean(rank_matrix_tmp) + 1 print( "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (count, r1, r5, r10, medr, meanr)) results.append( np.argsort(-score_matrix[caption_idx]).tolist()[:20]) count += 1 r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix) r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix) r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix) medr = np.floor(np.median(rank_matrix) + 1) meanr = np.mean(rank_matrix) + 1 print("************************************************") print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr)) print("************************************************") if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
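# The block above fills score_matrix / target_matrix incrementally and reports R@1/5/10 plus
# median and mean rank. The same metrics computed from complete matrices, as a compact
# reference sketch (the inputs are hypothetical):
import numpy as np


def retrieval_metrics_sketch(score_matrix, target_matrix):
    ranks = []
    for scores, targets in zip(score_matrix, target_matrix):
        gold = np.where(targets == 1)[0][0]
        ranks.append(np.where(np.argsort(-scores) == gold)[0][0])
    ranks = np.asarray(ranks)
    recall_at = {k: 100.0 * np.mean(ranks < k) for k in (1, 5, 10)}
    median_rank = np.floor(np.median(ranks) + 1)
    mean_rank = np.mean(ranks) + 1
    return recall_at, median_rank, mean_rank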
else: # init new processor processor = CharMLMProcessor(tokenizer=tokenizer, max_seq_len=192, data_dir=data_dir) batch_size = 32 data_silo = DataSilo(processor=processor, batch_size=batch_size) # model setup config = BertConfig( vocab_size_or_config_json_file=tokenizer.vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, ) prediction_head = CharMLMHead(hidden_size=768, vocab_size=tokenizer.vocab_size) if finetune: # load an existing model model = CharMLMAdaptiveModel.load(load_dir, device) else: # initialize a new model internal_model = BertModel(config=config) language_model = PretrainingBERT(internal_model) language_model.language = "ancient-greek"
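# CharMLMHead(hidden_size=768, vocab_size=...) above is project-specific. A minimal sketch of
# what such a masked-character prediction head usually amounts to -- a linear projection over
# the hidden states trained with an ignore-index cross-entropy -- offered as an assumption,
# not the project's code:
import torch.nn as nn


class CharMLMHeadSketch(nn.Module):
    def __init__(self, hidden_size, vocab_size, ignore_index=-100):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

    def forward(self, hidden_states, labels=None):
        logits = self.decoder(hidden_states)            # (batch, seq_len, vocab_size)
        if labels is None:
            return logits
        return self.loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))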
def main(): parser = argparse.ArgumentParser() ## Required parameters(即required=True的参数必须在命令上出现) parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "数据集路径. The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="模型类型(这里为bert). Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "下载好的预训练模型. Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "模型预测和断点文件的存放路径. The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help= "预训练的配置名字或路径. Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "预训练分词器名字或路径. Pretrained tokenizer name or path if not the same as model_name" ) parser.add_argument( "--cache_dir", default="", type=str, help= "从亚马逊s3下载的预训练模型存放路径. Where do you want to store the pre-trained models downloaded from s3" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "最长序列长度. The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="是否训练. Whether to run training.") parser.add_argument("--do_test", action='store_true', help="是否测试. Whether to run testing.") parser.add_argument("--predict_eval", action='store_true', help="是否预测验证集. Whether to predict eval set.") parser.add_argument("--do_eval", action='store_true', help="是否验证. Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="是否训练中跑验证. Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="是否用小写模型. Set this flag if you are using an uncased model.") parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="训练时每个GPU/CPU上的batch size. Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="验证时每个GPU/CPU上的batch size. Batch size per GPU/CPU for evaluation." ) parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "反向传播前梯度累计的次数. Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="Adam的初始学习率. The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="权重衰减系数. Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Adam的Epsilon系数. Epsilon for Adam optimizer.") parser.add_argument( "--max_grad_norm", default=1.0, type=float, help= " 如果所有参数的gradient组成的向量的L2 norm大于max norm,那么需要根据L2 norm/max_norm进行缩放。从而使得L2 norm小于预设的clip_norm. Max gradient norm." ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="训练epoch数. 
Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument( "--warmup_steps", default=0, type=int, help="线性warmup的steps. Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="测试集划分. text split") parser.add_argument('--logging_steps', type=int, default=50, help="日志更新steps. Log every X updates steps.") parser.add_argument( '--save_steps', type=int, default=50, help="断点文件保存steps. Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="不用cuda. Avoid using CUDA when available") parser.add_argument( '--overwrite_output_dir', action='store_true', help="重写输出路径. Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="初始化用的随机种子. random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="为了分布式训练. For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="远程debug用的ip. For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="远程debug用的端口. For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="冻结BERT. 
freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: # 如果无指定GPU或允许使用CUDA,就使用当前所有GPU device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # 指定使用哪个GPU(local_rank代表当前程序进程使用的GPU标号) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging 初始化日志 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed 设置种子数 set_seed(args) # 创建存放路径 try: os.makedirs(args.output_dir) except: pass # 载入预训练好的BERT分词器 tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # 载入预设好的BERT配置文件 config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model 载入并配置好基于BERT的序列分类模型 model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) # 开启FP16 if args.fp16: model.half() model.to(device) # 如果是指定了单个GPU,用DistributedDataParallel进行GPU训练 if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
        model = DDP(model)
    # With multiple GPUs (and no explicit local_rank), use torch.nn.DataParallel,
    # which automatically spreads each batch over all available GPUs.
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Total batch size = number of GPUs * per-GPU batch size
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train:
        # Prepare data loader: read the data and convert it to model inputs
        train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        train_features = convert_examples_to_features(train_examples, tokenizer,
                                                      args.max_seq_length, args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

        # Random sampling on a single machine, distributed sampling when a GPU rank is specified
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        # Prepare the dataloader
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        # Number of training steps
        num_train_optimization_steps = args.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove pooler, which is not used
        # thus it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        # Parameters listed in no_decay are excluded from weight decay.
        # BatchNorm fixes a channel C and normalizes over [B, H, W] (to zero mean, unit variance),
        # which suits CNNs; LayerNorm fixes a sample N and normalizes over [C, H, W], which suits
        # RNNs: BN assumes a fixed network depth, whereas an RNN's depth varies with the input
        # sequence length, and LN does not depend on the batch size or the sequence depth, so it
        # also works with batch size 1 and variable-length input sequences.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        # Configure the optimizer and the warmup schedule
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps // args.gradient_accumulation_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)  # cycle endlessly over the training data

        # Run an initial evaluation first
        for file in ['dev.csv']:
            inference_labels = []
            gold_labels = []
            inference_logits = []
            eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
            logger.info("***** Running evaluation *****")
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)

            # Run prediction for full data: prepare the validation dataloader
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            # Switch to inference mode (disables dropout and batch-norm updates)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                # Move the batch to the GPU
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                # No gradient updates during evaluation
                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                    # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(np.argmax(logits, axis=1))
                gold_labels.append(label_ids)
                inference_logits.append(logits)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            gold_labels = np.concatenate(gold_labels, 0)
            inference_logits = np.concatenate(inference_logits, 0)
            model.train()
            eval_loss = eval_loss / nb_eval_steps  # validation loss
            eval_accuracy = accuracy(inference_logits, gold_labels)  # validation score

            result = {
                'eval_loss': eval_loss,
                'eval_F1': eval_accuracy,
                'global_step': global_step
            }

            # Append the validation metrics to eval_results.txt
            output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')

            # If the current model is the best so far, save it
            if eval_accuracy > best_acc and 'dev' in file:
                print("=" * 80)
                print("Best F1", eval_accuracy)
                print("Saving Model......")
                best_acc = eval_accuracy
                # Save a trained model
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)

        model.train()
        # Iterate over training batches
        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids,
                            attention_mask=input_mask, labels=label_ids)
            nb_tr_examples += input_ids.size(0)
            del input_ids, input_mask, segment_ids, label_ids
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_steps += 1

            # Backward pass (FP16 goes through the optimizer)
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # Update the weights once the accumulated gradients are ready
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()       # apply the gradient update
                scheduler.step()       # advance the learning-rate schedule
                optimizer.zero_grad()  # clear existing gradients so they do not accumulate
                global_step += 1

            # Every args.eval_steps * args.gradient_accumulation_steps steps, report the training loss
            if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info(" %s = %s", 'global_step', str(global_step))
                logger.info(" %s = %s", 'train loss', str(train_loss))

            # Every args.eval_steps * args.gradient_accumulation_steps steps, evaluate on the dev set
            if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and (
                    step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                                 args.max_seq_length, args.split_num, False)
                    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(eval_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                                 batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids=input_ids,
                                                          token_type_ids=segment_ids,
                                                          attention_mask=input_mask,
                                                          labels=label_ids)
                            # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)
                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info(" %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')

                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)

    # Predict on the test set
    if args.do_test:
        del model
        gc.collect()           # free memory
        args.do_train = False  # stop training
        # Load the best saved model
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config)
        if args.fp16:
            # nn.Module.half() converts the model's float32 parameters to float16
            model.half()
        model.to(device)  # move the model to the GPU

        # Configure multi-GPU / distributed inference
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Predict on the dev and test sets
        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))

            # Save the prediction files
            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1', 'label_2']].to_csv(
                    os.path.join(args.output_dir, "sub.csv"), index=False)
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1', 'label_2']].to_csv(
                    os.path.join(args.output_dir, "sub_dev.csv"), index=False)

    # Predict on the dev set only
    if args.predict_eval:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))

            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1', 'label_2']].to_csv(
                    os.path.join(args.output_dir, "sub_dev.csv"), index=False)
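# The run script above calls a set_seed(args) helper that is not defined in this excerpt. Below
# is a minimal sketch of what such a helper conventionally looks like, assuming args carries
# .seed and .n_gpu as set in main(); the original code base may implement it differently.
import random

import numpy as np
import torch


def set_seed(args):
    # Seed every RNG the training loop touches so that runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        # Also seed all visible CUDA devices.
        torch.cuda.manual_seed_all(args.seed)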
if args.baseline:
    from pytorch_transformers.modeling_bert import BertConfig
    from vilbert.basebert import BaseBertForVLTasks
else:
    from vilbert.vilbert import BertConfig
    from vilbert.vilbert import VILBertForVLTasks

task_names = []
for i, task_id in enumerate(args.tasks.split("-")):
    task = "TASK" + task_id
    name = task_cfg[task]["name"]
    task_names.append(name)

# timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name
savePath = os.path.join(args.output_dir, timeStamp)

config = BertConfig.from_json_file(args.config_file)
if args.task_specific_tokens:
    config.task_specific_tokens = True

if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend="nccl")
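# The fragment above reads --local_rank and initializes the NCCL process group; scripts written
# this way are normally started with PyTorch's distributed launcher, which spawns one process per
# GPU and passes --local_rank to each. An illustrative invocation is shown below; the script file
# name and GPU count are assumptions, not taken from the original repository:
#
#   python -m torch.distributed.launch --nproc_per_node=4 train_tasks.py \
#       --tasks 1 --from_pretrained bert-base-uncased \
#       --config_file config/bert_base_6layer_6conect.json --save_name distributed_run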
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--meta_path", default=None, type=str, required=False,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run prediction on the test set.")
    parser.add_argument("--predict_eval", action='store_true',
                        help="Whether to predict eval set.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument("--freeze", default=0, type=int, required=False,
                        help="freeze bert.")
    parser.add_argument("--not_do_eval_steps", default=0.35, type=float,
                        help="not_do_eval_steps.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    try:
        os.makedirs(args.output_dir)
    except:
        pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train:
        # Prepare data loader
        train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        train_features = convert_examples_to_features(train_examples, tokenizer,
                                                      args.max_seq_length, args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove pooler, which is not used
        # thus it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps // args.gradient_accumulation_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        # Run an initial evaluation first
        for file in ['dev.csv']:
            inference_labels = []
            gold_labels = []
            inference_logits = []
            eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, args.split_num, False)
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            logger.info("***** Running evaluation *****")
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and ( step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with 
open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != 
-1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
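# An illustrative way to launch the classification run script above; the script file name, data
# paths, and hyper-parameter values here are assumptions for demonstration, not the original
# authors' settings. --model_type must be one of MODEL_CLASSES, which is defined outside this excerpt.
#
#   python run_bert.py \
#       --data_dir ./data --model_type bert --model_name_or_path bert-base-chinese \
#       --output_dir ./output --max_seq_length 256 --split_num 3 \
#       --per_gpu_train_batch_size 8 --per_gpu_eval_batch_size 16 --learning_rate 5e-5 \
#       --train_steps 20000 --eval_steps 200 \
#       --do_train --do_eval --do_test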
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): """ Copy/paste/tweak roberta's weights to our BERT structure. """ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout config = BertConfig( vocab_size_or_config_json_file=50265, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, ) if classification_head: config.num_labels = roberta.args.num_classes print("Our BERT config:", config) model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) model.eval() # Now let's copy all the weights. # Embeddings roberta_sent_encoder = roberta.model.decoder.sentence_encoder model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps for i in range(config.num_hidden_layers): # Encoder: start of layer layer: BertLayer = model.roberta.encoder.layer[i] roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] ### self attention self_attn: BertSelfAttention = layer.attention.self assert( roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) ) # we use three distinct linear layers so we split the source layer here. 
self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] ### self-attention output self_output: BertSelfOutput = layer.attention.output assert( self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape ) self_output.dense.weight = roberta_layer.self_attn.out_proj.weight self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps ### intermediate intermediate: BertIntermediate = layer.intermediate assert( intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape ) intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias ### output bert_output: BertOutput = layer.output assert( bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape ) bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps #### end of layer if classification_head: model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias else: # LM Head model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight model.lm_head.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 our_output = model(input_ids)[0] if classification_head: their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) success = torch.allclose(our_output, their_output, atol=1e-3) print( "Do both models output the same tensors?", "🔥" if success else "💩" ) if not success: raise Exception("Something went wRoNg") print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path)
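# A minimal sketch of invoking the conversion function above directly from Python; the checkpoint
# directory and output folder are placeholder paths, not values taken from the original script.
#
#   convert_roberta_checkpoint_to_pytorch(
#       roberta_checkpoint_path="./roberta.base",           # directory containing fairseq's model.pt
#       pytorch_dump_folder_path="./roberta-base-pytorch",  # where the converted weights are written
#       classification_head=False,                          # set True to also convert the 'mnli' head
#   )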