def train(model, loader_train, loader_valid, num_train_epochs=70, x_for_rouge=None,
          x_sent_align=None, optim='adam', learning_rate=3e-5, unchanged_limit=20,
          weights=None, ofp_fname='PLT', batch_ids=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    rouge_sys_sent_path = 'data.nosync/rouge_sent/' + ofp_fname + '/'
    rouge_sys_segs_path = 'data.nosync/rouge_segs/' + ofp_fname + '/'
    output_model_file = 'saved_models/' + ofp_fname
    output_config_file = 'saved_configs/' + ofp_fname

    if not os.path.exists(rouge_sys_sent_path):
        os.mkdir(rouge_sys_sent_path)
    if not os.path.exists(rouge_sys_segs_path):
        os.mkdir(rouge_sys_segs_path)
    if not os.path.exists('saved_models'):
        os.mkdir('saved_models')
    if not os.path.exists('saved_configs'):
        os.mkdir('saved_configs')

    if optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.01)
    else:
        optimizer = BertAdam(model.parameters(), lr=learning_rate)

    model.train()

    loss_ls, loss_ls_s, loss_ls_qa, loss_valid_ls = [], [], [], []
    qa_acc, qa_f1, sent_acc, sent_f1 = [], [], [], []
    acc_loss, acc_loss_s, acc_loss_qa = [], [], []

    best_qa_f1, best_sent_f1 = None, None
    best_valid = 1e3
    unchanged = 0

    if weights is not None:
        weights = torch.tensor([weights, 1.0], dtype=torch.float32).to(device)

    cur_used_ls_mean, total_used, total_s, mean_seg_len = None, None, None, None

    for _ in trange(num_train_epochs, desc="Epoch"):
        for step, batch in enumerate(tqdm(loader_train, desc="Iteration")):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, start_positions, end_position, sent_labels, seg_ids = batch

            loss, loss_s, loss_q = model(input_ids, seg_ids, input_mask, sent_labels,
                                         start_positions, end_position, weights, train=True)
            loss.backward()
            optimizer.step()

            acc_loss.append(loss.cpu().data.numpy())
            acc_loss_s.append(loss_s.cpu().data.numpy())
            acc_loss_qa.append(loss_q.cpu().data.numpy())

            if (step + 1) % 10000 == 0:
                loss_ls.append(np.mean(acc_loss))
                loss_ls_s.append(np.mean(acc_loss_s))
                loss_ls_qa.append(np.mean(acc_loss_qa))
                acc_loss, acc_loss_s, acc_loss_qa = [], [], []

                with torch.no_grad():
                    eval_gt_start, eval_gt_end, eval_gt_sent = [], [], []
                    eval_sys_start, eval_sys_end, eval_sys_sent = [], [], []
                    valid_ls = []

                    for _, batch_valid in enumerate(tqdm(loader_valid, desc="Validation")):
                        batch_valid = tuple(t2.to(device) for t2 in batch_valid)
                        input_ids, input_mask, start_positions, end_position, sent_labels, seg_ids = batch_valid

                        start_l, end_l, sent_l, valid_l = model(input_ids, seg_ids, input_mask,
                                                                sent_labels, start_positions,
                                                                end_position, None)

                        eval_gt_start.extend(start_positions.cpu().data.numpy())
                        eval_gt_end.extend(end_position.cpu().data.numpy())
                        eval_gt_sent.extend(sent_labels.cpu().data.numpy())

                        eval_sys_start.extend(start_l.cpu().data.numpy())
                        eval_sys_end.extend(end_l.cpu().data.numpy())
                        eval_sys_sent.extend(sent_l.cpu().data.numpy())

                        valid_ls.append(valid_l.cpu().data.numpy())

                    qa_acc_val, qa_f1_val, sent_acc_val, sent_f1_val = get_valid_evaluation(
                        eval_gt_start, eval_gt_end, eval_gt_sent,
                        eval_sys_start, eval_sys_end, eval_sys_sent)
                    avg_val_loss = np.mean(valid_ls)

                    qa_acc.append(qa_acc_val)
                    qa_f1.append(qa_f1_val)
                    sent_acc.append(sent_acc_val)
                    sent_f1.append(sent_f1_val)
                    loss_valid_ls.append(avg_val_loss)

                    if avg_val_loss < best_valid:
                        best_valid = avg_val_loss
                        unchanged = 0
                        best_qa_f1 = qa_f1_val
                        best_sent_f1 = sent_f1_val

                        cur_used_ls_mean, total_used, total_s, mean_seg_len, _ = create_valid_rouge(
                            x_for_rouge, eval_sys_sent, eval_sys_start, eval_sys_end,
                            eval_gt_sent, eval_gt_start, eval_gt_end, batch_ids, x_sent_align,
                            rouge_sys_sent_path, rouge_sys_segs_path, ofp_fname)

                        torch.save(model, output_model_file)
                    elif unchanged > unchanged_limit:
                        create_metric_figure(ofp_fname, loss_ls, loss_ls_s, loss_ls_qa,
                                             loss_valid_ls, qa_f1, sent_f1, cur_used_ls_mean,
                                             total_used, total_s, mean_seg_len,
                                             best_qa_f1, best_sent_f1)
                        return
                    else:
                        unchanged += 1

    create_metric_figure(ofp_fname, loss_ls, loss_ls_s, loss_ls_qa, loss_valid_ls, qa_f1,
                         sent_f1, cur_used_ls_mean, total_used, total_s, mean_seg_len,
                         best_qa_f1, best_sent_f1)
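# A minimal sketch of the early-stopping bookkeeping used in train() above
# (best_valid / unchanged / unchanged_limit), pulled out into a reusable
# helper. The class name `EarlyStopper` and its interface are illustrative,
# not part of the original code.
class EarlyStopper:
    def __init__(self, patience=20):
        self.patience = patience
        self.best = float('inf')
        self.unchanged = 0

    def step(self, val_loss):
        """Return True once `patience` validations pass with no new best loss."""
        if val_loss < self.best:
            self.best = val_loss
            self.unchanged = 0      # improvement: reset the counter
            return False
        self.unchanged += 1         # no improvement: count it
        return self.unchanged > self.patience

# e.g. stopper = EarlyStopper(patience=unchanged_limit)
#      if stopper.step(avg_val_loss): return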
class DualTrainer(object): def __init__(self, qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path): self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") self.model = DualNet(qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path) train_dir = os.path.join("./save", "dual") self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S"))) # read data-set and prepare iterator self.train_loader = self.get_data_loader("./squad/train-v1.1.json") self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json") num_train_optimization_steps = len(self.train_loader) * config.num_epochs # optimizer param_optimizer = list(self.model.qa_model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]] no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] self.qa_opt = BertAdam(optimizer_grouped_parameters, lr=config.qa_lr, warmup=config.warmup_proportion, t_total=num_train_optimization_steps) params = list(self.model.ca2q_model.encoder.parameters()) \ + list(self.model.ca2q_model.decoder.parameters()) # self.qg_lr = config.lr self.qg_opt = optim.Adam(params, config.qa_lr) # assign model to device and wrap it with DataParallel torch.cuda.set_device(0) self.model.cuda() self.model = nn.DataParallel(self.model) def get_data_loader(self, file): train_examples = read_squad_examples(file, is_training=True, debug=config.debug) train_features = convert_examples_to_features(train_examples, tokenizer=self.tokenizer, max_seq_length=config.max_seq_len, max_query_length=config.max_query_len, doc_stride=128, is_training=True) all_c_ids = torch.tensor([f.c_ids for f in train_features], dtype=torch.long) all_c_lens = torch.sum(torch.sign(all_c_ids), 1) all_tag_ids = torch.tensor([f.tag_ids for f in train_features], dtype=torch.long) all_q_ids = torch.tensor([f.q_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_noq_start_positions = torch.tensor([f.noq_start_position for f in train_features], dtype=torch.long) all_noq_end_positions = torch.tensor([f.noq_end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_c_ids, all_c_lens, all_tag_ids, all_q_ids, all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions, all_noq_start_positions, all_noq_end_positions) sampler = RandomSampler(train_data) batch_size = int(config.batch_size / config.gradient_accumulation_steps) train_loader = DataLoader(train_data, sampler=sampler, batch_size=batch_size) return train_loader def save_model(self, loss, epoch): loss = round(loss, 3) dir_name = os.path.join(self.save_dir, "bert_{}_{:.3f}".format(epoch, loss)) if not os.path.exists(dir_name): os.makedirs(dir_name) # save bert model model_to_save = 
self.model.module.qa_model if hasattr(self.model, "module") else self.model.qa_model model_file = os.path.join(dir_name, "pytorch_model.bin") config_file = os.path.join(dir_name, "bert_config.json") state_dict = model_to_save.state_dict() torch.save(state_dict, model_file) model_to_save.config.to_json_file(config_file) # save qg model model_to_save = self.model.module.ca2q_model if hasattr(self.model, "module") else self.model.ca2q_model file = os.path.join(self.save_dir, "{}_{:.3f}".format(epoch, loss)) state_dict = { "encoder_state_dict": model_to_save.encoder.state_dict(), "decoder_state_dict": model_to_save.decoder.state_dict() } torch.save(state_dict, file) def train(self): global_step = 1 batch_num = len(self.train_loader) best_loss = 1e10 qa_loss_lst = [] qg_loss_lst = [] for epoch in range(1, config.num_epochs + 1): start = time.time() for step, batch in enumerate(self.train_loader, start=1): qa_loss, ca2q_loss = self.model(batch) # mean() to average across multiple gpu and back-propagation qa_loss = qa_loss.mean() / config.gradient_accumulation_steps ca2q_loss = ca2q_loss.mean() / config.gradient_accumulation_steps qa_loss.backward(retain_graph=True) ca2q_loss.backward() qa_loss_lst.append(qa_loss.detach().item()) qg_loss_lst.append(ca2q_loss.detach().item()) # clip gradient nn.utils.clip_grad_norm_(self.model.module.ca2q_model.parameters(), config.max_grad_norm) # update params if step % config.gradient_accumulation_steps == 0: self.qa_opt.step() self.qg_opt.step() # zero grad self.qa_opt.zero_grad() self.qg_opt.zero_grad() global_step += 1 avg_qa_loss = sum(qa_loss_lst) avg_qg_loss = sum(qg_loss_lst) # empty list qa_loss_lst = [] qg_loss_lst = [] msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}, ca2q_loss :{:.2f}" \ .format(step, batch_num, progress_bar(step, batch_num), eta(start, step, batch_num), avg_qa_loss, avg_qg_loss) print(msg, end="\r") val_qa_loss, val_qg_loss = self.evaluate(msg) if val_qg_loss <= best_loss: best_loss = val_qg_loss self.save_model(val_qg_loss, epoch) print("Epoch {} took {} - final loss : {:.4f} - qa_loss :{:.4f}, qg_loss :{:.4f}" .format(epoch, user_friendly_time(time_since(start)), ca2q_loss, val_qa_loss, val_qg_loss)) def evaluate(self, msg): self.model.module.qa_model.eval() self.model.module.ca2q_model.eval_mode() num_val_batches = len(self.dev_loader) val_qa_losses = [] val_qg_losses = [] for i, val_data in enumerate(self.dev_loader, start=1): with torch.no_grad(): val_batch_loss = self.model(val_data) qa_loss, qg_loss = val_batch_loss val_qa_losses.append(qa_loss.mean().item()) val_qg_losses.append(qg_loss.mean().item()) msg2 = "{} => Evaluating :{}/{}".format(msg, i, num_val_batches) print(msg2, end="\r") val_qa_loss = np.mean(val_qa_losses) val_qg_loss = np.mean(val_qg_losses) self.model.module.qa_model.train() self.model.module.ca2q_model.train_mode() return val_qa_loss, val_qg_loss
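# DualTrainer groups BERT parameters so that biases and LayerNorm weights are
# excluded from L2 weight decay, a pattern repeated in several of the loops
# below. A self-contained sketch of that grouping with a stock PyTorch
# optimizer (AdamW here is only a stand-in for the BertAdam used above):
import torch
import torch.nn as nn

def build_param_groups(model: nn.Module, weight_decay: float = 0.01):
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decay, skip = [], []
    for name, param in model.named_parameters():
        (skip if any(nd in name for nd in no_decay) else decay).append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},  # regular weights: decayed
        {"params": skip, "weight_decay": 0.0},            # biases / LayerNorm: no decay
    ]

# e.g. optimizer = torch.optim.AdamW(build_param_groups(model), lr=3e-5)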
    'best_acc_test': 0,
    'best_mac_test': 0,
    'best_mic_test': 0
}

for epoch in range(args.max_epoch):
    print('-' * 20, 'Epoch {}'.format(epoch), '-' * 20)
    start_time = time.time()
    epoch_loss = []
    progress = tqdm.tqdm(total=batch_num, mininterval=1,
                         desc='Epoch: {}'.format(epoch))
    for batch_idx in range(batch_num):
        global_step += 1
        progress.update(1)
        optimizer.zero_grad()
        batch = train_set.next_batch(label_size, batch_size,
                                     drop_last=True, shuffle=True, gpu=gpu)
        (
            elmos,
            labels,
            men_masks,
            ctx_masks,
            dists,
            gathers,
            men_ids,
        ) = batch
        loss = model.forward(elmos, labels, men_masks, ctx_masks, dists,
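# The running-best dictionary at the top of this loop tracks accuracy plus
# macro- and micro-averaged test scores. A small illustration of how those
# two averages are usually computed; scikit-learn is shown only as an
# example, the fragment above does not say which implementation it uses.
from sklearn.metrics import accuracy_score, f1_score

def summarize_scores(gold, pred):
    return {
        'acc': accuracy_score(gold, pred),
        'mac': f1_score(gold, pred, average='macro'),  # unweighted mean over classes
        'mic': f1_score(gold, pred, average='micro'),  # pooled TP/FP/FN counts
    }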
class MTDNNModel(MTDNNPretrainedModel): """Instance of an MTDNN Model Arguments: MTDNNPretrainedModel {BertPretrainedModel} -- Inherited from Bert Pretrained config {MTDNNConfig} -- MTDNN Configuration Object pretrained_model_name {str} -- Name of the pretrained model to initial checkpoint num_train_step {int} -- Number of steps to take each training Raises: RuntimeError: [description] ImportError: [description] Returns: MTDNNModel -- An Instance of an MTDNN Model """ def __init__( self, config: MTDNNConfig, task_defs: MTDNNTaskDefs, data_processor: MTDNNDataProcess, pretrained_model_name: str = "mtdnn-base-uncased", test_datasets_list: list = [], output_dir: str = "checkpoint", ): # Input validation assert ( config.init_checkpoint in self.supported_init_checkpoints() ), f"Initial checkpoint must be in {self.supported_init_checkpoints()}" num_train_step = data_processor.get_num_all_batches() decoder_opts = data_processor.get_decoder_options_list() task_types = data_processor.get_task_types_list() dropout_list = data_processor.get_tasks_dropout_prob_list() loss_types = data_processor.get_loss_types_list() kd_loss_types = data_processor.get_kd_loss_types_list() tasks_nclass_list = data_processor.get_task_nclass_list() # data loaders multitask_train_dataloader = data_processor.get_train_dataloader() dev_dataloaders_list = data_processor.get_dev_dataloaders() test_dataloaders_list = data_processor.get_test_dataloaders() assert decoder_opts, "Decoder options list is required!" assert task_types, "Task types list is required!" assert dropout_list, "Task dropout list is required!" assert loss_types, "Loss types list is required!" assert kd_loss_types, "KD Loss types list is required!" assert tasks_nclass_list, "Tasks nclass list is required!" assert (multitask_train_dataloader ), "DataLoader for multiple tasks cannot be None" super(MTDNNModel, self).__init__(config) # Initialize model config and update with training options self.config = config self.update_config_with_training_opts( decoder_opts, task_types, dropout_list, loss_types, kd_loss_types, tasks_nclass_list, ) wandb.init(project='mtl-uncertainty-final', entity='feifang24', config=self.config.to_dict()) self.tasks = data_processor.tasks # {task_name: task_idx} self.task_defs = task_defs self.multitask_train_dataloader = multitask_train_dataloader self.dev_dataloaders_list = dev_dataloaders_list self.test_dataloaders_list = test_dataloaders_list self.test_datasets_list = self._configure_test_ds(test_datasets_list) self.output_dir = output_dir self.batch_bald = BatchBALD(num_samples=10, num_draw=500, shuffle_prop=0.0, reverse=True, reduction='mean') self.loss_weights = [None] * self.num_tasks # Create the output_dir if it's doesn't exist MTDNNCommonUtils.create_directory_if_not_exists(self.output_dir) self.pooler = None # Resume from model checkpoint if self.config.resume and self.config.model_ckpt: assert os.path.exists( self.config.model_ckpt), "Model checkpoint does not exist" logger.info(f"loading model from {self.config.model_ckpt}") self = self.load(self.config.model_ckpt) return # Setup the baseline network # - Define the encoder based on config options # - Set state dictionary based on configuration setting # - Download pretrained model if flag is set # TODO - Use Model.pretrained_model() after configuration file is hosted. if self.config.use_pretrained_model: with MTDNNCommonUtils.download_path() as file_path: path = pathlib.Path(file_path) self.local_model_path = MTDNNCommonUtils.maybe_download( url=self. 
pretrained_model_archive_map[pretrained_model_name], log=logger, ) self.bert_model = MTDNNCommonUtils.load_pytorch_model( self.local_model_path) self.state_dict = self.bert_model["state"] else: # Set the config base on encoder type set for initial checkpoint if config.encoder_type == EncoderModelType.BERT: self.bert_config = BertConfig.from_dict(self.config.to_dict()) self.bert_model = BertModel.from_pretrained( self.config.init_checkpoint) self.state_dict = self.bert_model.state_dict() self.config.hidden_size = self.bert_config.hidden_size if config.encoder_type == EncoderModelType.ROBERTA: # Download and extract from PyTorch hub if not downloaded before self.bert_model = torch.hub.load("pytorch/fairseq", config.init_checkpoint) self.config.hidden_size = self.bert_model.args.encoder_embed_dim self.pooler = LinearPooler(self.config.hidden_size) new_state_dict = {} for key, val in self.bert_model.state_dict().items(): if key.startswith("model.decoder.sentence_encoder" ) or key.startswith( "model.classification_heads"): key = f"bert.{key}" new_state_dict[key] = val # backward compatibility PyTorch <= 1.0.0 if key.startswith("classification_heads"): key = f"bert.model.{key}" new_state_dict[key] = val self.state_dict = new_state_dict self.updates = (self.state_dict["updates"] if self.state_dict and "updates" in self.state_dict else 0) self.local_updates = 0 self.train_loss = AverageMeter() self.train_loss_by_task = [ AverageMeter() for _ in range(len(self.tasks)) ] self.network = SANBERTNetwork( init_checkpoint_model=self.bert_model, pooler=self.pooler, config=self.config, ) if self.state_dict: self.network.load_state_dict(self.state_dict, strict=False) self.mnetwork = (nn.DataParallel(self.network) if self.config.multi_gpu_on else self.network) self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) # Move network to GPU if device available and flag set if self.config.cuda: self.network.cuda(device=self.config.cuda_device) self.optimizer_parameters = self._get_param_groups() self._setup_optim(self.optimizer_parameters, self.state_dict, num_train_step) self.para_swapped = False self.optimizer.zero_grad() self._setup_lossmap() @property def num_tasks(self): return len(self.tasks) def _configure_test_ds(self, test_datasets_list): if test_datasets_list: return test_datasets_list result = [] for task in self.task_defs.get_task_names(): if task == 'mnli': result.append('mnli_matched') result.append('mnli_mismatched') else: result.append(task) return result def _get_param_groups(self): no_decay = [ "bias", "gamma", "beta", "LayerNorm.bias", "LayerNorm.weight" ] optimizer_parameters = [ { "params": [ p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] return optimizer_parameters def _setup_optim(self, optimizer_parameters, state_dict: dict = None, num_train_step: int = -1): # Setup optimizer parameters if self.config.optimizer == "sgd": self.optimizer = optim.SGD( optimizer_parameters, self.config.learning_rate, weight_decay=self.config.weight_decay, ) elif self.config.optimizer == "adamax": self.optimizer = Adamax( optimizer_parameters, self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, weight_decay=self.config.weight_decay, ) elif self.config.optimizer == "radam": 
self.optimizer = RAdam( optimizer_parameters, self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, eps=self.config.adam_eps, weight_decay=self.config.weight_decay, ) # The current radam does not support FP16. self.config.fp16 = False elif self.config.optimizer == "adam": self.optimizer = Adam( optimizer_parameters, lr=self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, weight_decay=self.config.weight_decay, ) else: raise RuntimeError( f"Unsupported optimizer: {self.config.optimizer}") # Clear scheduler for certain optimizer choices if self.config.optimizer in ["adam", "adamax", "radam"]: if self.config.have_lr_scheduler: self.config.have_lr_scheduler = False if state_dict and "optimizer" in state_dict: self.optimizer.load_state_dict(state_dict["optimizer"]) if self.config.fp16: try: global amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize( self.network, self.optimizer, opt_level=self.config.fp16_opt_level) self.network = model self.optimizer = optimizer if self.config.have_lr_scheduler: if self.config.scheduler_type == "rop": self.scheduler = ReduceLROnPlateau(self.optimizer, mode="max", factor=self.config.lr_gamma, patience=3) elif self.config.scheduler_type == "exp": self.scheduler = ExponentialLR(self.optimizer, gamma=self.config.lr_gamma or 0.95) else: milestones = [ int(step) for step in ( self.config.multi_step_lr or "10,20,30").split(",") ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=self.config.lr_gamma) else: self.scheduler = None def _setup_lossmap(self): self.task_loss_criterion = [] for idx, cs in enumerate(self.config.loss_types): assert cs is not None, "Loss type must be defined." lc = LOSS_REGISTRY[cs](name=f"Loss func of task {idx}: {cs}") self.task_loss_criterion.append(lc) def _setup_kd_lossmap(self): loss_types = self.config.kd_loss_types self.kd_task_loss_criterion = [] if self.config.mkd_opt > 0: for idx, cs in enumerate(loss_types): assert cs, "Loss type must be defined." 
lc = LOSS_REGISTRY[cs]( name="Loss func of task {}: {}".format(idx, cs)) self.kd_task_loss_criterion.append(lc) def _to_cuda(self, tensor): # Set tensor to gpu (non-blocking) if a PyTorch tensor if tensor is None: return tensor if isinstance(tensor, list) or isinstance(tensor, tuple): y = [ e.cuda(device=self.config.cuda_device, non_blocking=True) for e in tensor ] for t in y: t.requires_grad = False else: y = tensor.cuda(device=self.config.cuda_device, non_blocking=True) y.requires_grad = False return y def train(self): if self.para_swapped: self.para_swapped = False def update(self, batch_meta, batch_data): self.network.train() target = batch_data[batch_meta["label"]] soft_labels = None task_type = batch_meta["task_type"] target = self._to_cuda(target) if self.config.cuda else target task_id = batch_meta["task_id"] inputs = batch_data[:batch_meta["input_len"]] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) weight = self.loss_weights[task_id] if self.config.weighted_on: if self.config.cuda: weight = batch_data[batch_meta["factor"]].cuda( device=self.config.cuda_device, non_blocking=True) else: weight = batch_data[batch_meta["factor"]] logits = self.mnetwork(*inputs) # compute loss loss = 0 if self.task_loss_criterion[task_id] and (target is not None): loss = self.task_loss_criterion[task_id](logits, target, weight, ignore_index=-1) # compute kd loss if self.config.mkd_opt > 0 and ("soft_label" in batch_meta): soft_labels = batch_meta["soft_label"] soft_labels = (self._to_cuda(soft_labels) if self.config.cuda else soft_labels) kd_lc = self.kd_task_loss_criterion[task_id] kd_loss = (kd_lc(logits, soft_labels, weight, ignore_index=-1) if kd_lc else 0) loss = loss + kd_loss self.train_loss_by_task[task_id].update( loss.item() / (self.loss_weights[task_id] if self.loss_weights[task_id] is not None else 1.), batch_data[batch_meta["token_id"]].size(0)) self.train_loss.update(loss.item(), batch_data[batch_meta["token_id"]].size(0)) # scale loss loss = loss / (self.config.grad_accumulation_step or 1) if self.config.fp16: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.local_updates += 1 if self.local_updates % self.config.grad_accumulation_step == 0: if self.config.global_grad_clipping > 0: if self.config.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.config.global_grad_clipping, ) else: torch.nn.utils.clip_grad_norm_( self.network.parameters(), self.config.global_grad_clipping) self.updates += 1 # reset number of the grad accumulation self.optimizer.step() self.optimizer.zero_grad() def eval_mode(self, data: DataLoader, metric_meta, use_cuda=True, with_label=True, label_mapper=None, task_type=TaskType.Classification): eval_loss = AverageMeter() if use_cuda: self.cuda() predictions = [] golds = [] scores = [] uncertainties = [] ids = [] metrics = {} for idx, (batch_info, batch_data) in enumerate(data): if idx % 100 == 0: logger.info(f"predicting {idx}") batch_info, batch_data = MTDNNCollater.patch_data( use_cuda, batch_info, batch_data) score, pred, gold, loss, uncertainty = self._predict_batch( batch_info, batch_data) predictions.extend(pred) golds.extend(gold) scores.extend(score) uncertainties.extend(uncertainty) ids.extend(batch_info["uids"]) eval_loss.update(loss.item(), len(batch_info["uids"])) if task_type == TaskType.Span: golds = merge_answers(ids, golds) predictions, scores = select_answers(ids, predictions, scores) if with_label: metrics = 
calc_metrics(metric_meta, golds, predictions, scores, label_mapper) return metrics, predictions, scores, golds, ids, ( eval_loss.avg, eval_loss.count), np.mean(uncertainties) def _predict_batch(self, batch_meta, batch_data): self.network.eval() task_id = batch_meta["task_id"] task_type = batch_meta["task_type"] inputs = batch_data[:batch_meta["input_len"]] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) # get logits (and val loss if we have labels) label = batch_meta["label"] target = batch_data[label] if type(label) is int else torch.tensor( label) target = self._to_cuda(target) if self.config.cuda else target weight = None if self.config.weighted_on: if self.config.cuda: weight = batch_data[batch_meta["factor"]].cuda( device=self.config.cuda_device, non_blocking=True) else: weight = batch_data[batch_meta["factor"]] score = self.mnetwork(*inputs) if self.config.mc_dropout_samples > 0: def apply_dropout(m): if isinstance(m, DropoutWrapper): m.train() self.network.apply(apply_dropout) mc_sample_scores = torch.stack([ self.mnetwork(*inputs) for _ in range(self.config.mc_dropout_samples) ], -1) mc_sample_scores = F.softmax(mc_sample_scores, dim=1).data.cpu().numpy() uncertainty = self.batch_bald.get_uncertainties(mc_sample_scores) else: uncertainty = 1.0 loss = None if self.task_loss_criterion[task_id] and (target is not None): loss = self.task_loss_criterion[task_id](score, target, weight, ignore_index=-1) if task_type == TaskType.Ranking: score = score.contiguous().view(-1, batch_meta["pairwise_size"]) assert task_type == TaskType.Ranking score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta["true_label"], loss elif task_type == TaskType.SequenceLabeling: mask = batch_data[batch_meta["mask"]] score = score.contiguous() score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).reshape(mask.size()).tolist() valied_lenght = mask.sum(1).tolist() final_predict = [] for idx, p in enumerate(predict): final_predict.append(p[:valied_lenght[idx]]) score = score.reshape(-1).tolist() return score, final_predict, batch_meta["label"], loss elif task_type == TaskType.Span: start, end = score predictions = [] if self.config.encoder_type == EncoderModelType.BERT: scores, predictions = extract_answer( batch_meta, batch_data, start, end, self.config.get("max_answer_len", 5), ) return scores, predictions, batch_meta["answer"], loss else: if task_type == TaskType.Classification: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta["label"], loss, uncertainty def _rerank_batches(self, batches, start_idx, task_id_to_weights, softmax_task_weights=False): def weights_to_probs(weights): if softmax_task_weights: probs = softmax(weights) else: probs = weights / np.sum(weights) return probs # reshuffle all batches; sort them by task_id new_batches = [list(self.multitask_train_dataloader) for _ in range(5)] for i in range(len(new_batches)): random.shuffle(new_batches[i]) # this line somehow helps? 
new_batches = [b for batches in new_batches for b in batches] # flatten task_id_by_batch = [ batch_meta["task_id"] for batch_meta, _ in new_batches ] batches_by_task = [[] for _ in range(self.num_tasks)] for batch_idx, task_id in enumerate(task_id_by_batch): batches_by_task[task_id].append(batch_idx) task_probs = weights_to_probs(task_id_to_weights) # multiply weight by num batches per task # task_probs = weights_to_probs(task_id_to_weights * np.asarray([len(batches) for batches in batches_by_task])) # comment out as see fit if self.config.uncertainty_based_weight: rel_loss_weights = (1. / task_id_to_weights) self.loss_weights = (rel_loss_weights * self.num_tasks / np.sum(rel_loss_weights)) * \ (np.mean(self.dev_loss_by_task) / self.dev_loss_by_task) # self.loss_weights = rel_loss_weights * np.mean(task_id_to_weights) num_batches = len(batches[start_idx:]) # sample num_batches many tasks w/ replacement task_indices_sampled = np.random.choice(self.num_tasks, num_batches, replace=True, p=task_probs) reranked_batches = [None] * num_batches counters = [0] * self.num_tasks for i, task_id in enumerate(task_indices_sampled): batch_idx = batches_by_task[task_id][counters[task_id] % len(batches_by_task[task_id])] counters[task_id] += 1 reranked_batches[i] = new_batches[batch_idx] weights_by_task_name = {} for task_name, task_id in self.tasks.items(): weights_by_task_name[f'task_weight/{task_name}'] = task_probs[ task_id] return [None] * start_idx + reranked_batches, weights_by_task_name def fit(self, epochs=0): """ Fit model to training datasets """ epochs = epochs or self.config.epochs logger.info(f"Total number of params: {self.total_param}") FIRST_STEP_TO_LOG = 10 for epoch in range(1, epochs + 1): logger.info(f"At epoch {epoch}") logger.info( f"Amount of data to go over: {len(self.multitask_train_dataloader)}" ) start = datetime.now() # Create batches and train batches = list(self.multitask_train_dataloader) if self.config.uncertainty_based_sampling and epoch > 1: batches, weights_by_task_name = self._rerank_batches( batches, start_idx=0, task_id_to_weights=self.smoothed_uncertainties_by_task) for idx in range(len(batches)): batch_meta, batch_data = batches[idx] batch_meta, batch_data = MTDNNCollater.patch_data( self.config.cuda, batch_meta, batch_data) task_id = batch_meta["task_id"] self.update(batch_meta, batch_data) if (self.local_updates == FIRST_STEP_TO_LOG or (self.local_updates) % (self.config.log_per_updates * self.config.grad_accumulation_step) == 0): time_left = str((datetime.now() - start) / (idx + 1) * (len(self.multitask_train_dataloader) - idx - 1)).split(".")[0] logger.info( "Updates - [{0:6}] Training Loss - [{1:.5f}] Time Remaining - [{2}]" .format(self.updates, self.train_loss.avg, time_left)) val_logs, uncertainties_by_task = self._eval( epoch, save_scores=False, eval_type='dev') test_logs, _ = self._eval(epoch, save_scores=False, eval_type='test') if self.local_updates == FIRST_STEP_TO_LOG: weights_by_task_name = { f'task_weight/{task_name}': 1.0 for task_name in self.tasks } else: if self.local_updates == self.config.log_per_updates * self.config.grad_accumulation_step: self.smoothed_uncertainties_by_task = uncertainties_by_task self.initial_train_loss_by_task = np.asarray( [loss.avg for loss in self.train_loss_by_task]) else: alpha = self.config.smooth_uncertainties self.smoothed_uncertainties_by_task = alpha * self.smoothed_uncertainties_by_task + \ (1 - alpha) * uncertainties_by_task if self.config.uncertainty_based_sampling and idx < len( batches) - 1: batches, 
weights_by_task_name = self._rerank_batches( batches, start_idx=idx + 1, task_id_to_weights=self. smoothed_uncertainties_by_task) if self.config.rate_based_weight: current_train_loss_by_task = np.asarray( [loss.avg for loss in self.train_loss_by_task]) rate_of_training_by_task = current_train_loss_by_task / self.initial_train_loss_by_task self.loss_weights = (rate_of_training_by_task / np.mean(rate_of_training_by_task)) * \ (np.mean(current_train_loss_by_task) / current_train_loss_by_task) self._log_training({ **val_logs, **test_logs, **weights_by_task_name }) if self.config.save_per_updates_on and ( (self.local_updates) % (self.config.save_per_updates * self.config.grad_accumulation_step) == 0): model_file = os.path.join( self.output_dir, "model_{}_{}.pt".format(epoch, self.updates), ) logger.info(f"Saving mt-dnn model to {model_file}") self.save(model_file) # Eval and save checkpoint after each epoch logger.info('=' * 5 + f' End of EPOCH {epoch} ' + '=' * 5) logger.info(f'Train loss (epoch avg): {self.train_loss.avg}') val_logs, uncertainties_by_task = self._eval(epoch, save_scores=True, eval_type='dev') test_logs, _ = self._eval(epoch, save_scores=True, eval_type='test') self._log_training({ **val_logs, **test_logs, **weights_by_task_name }) # model_file = os.path.join(self.output_dir, "model_{}.pt".format(epoch)) # logger.info(f"Saving mt-dnn model to {model_file}") # self.save(model_file) def _eval(self, epoch, save_scores, eval_type='dev'): if eval_type not in {'dev', 'test'}: raise ValueError( "eval_type must be one of the following: 'dev' or 'test'.") is_dev = eval_type == 'dev' log_dict = {} loss_agg = AverageMeter() loss_by_task = {} uncertainties_by_task = {} for idx, dataset in enumerate(self.test_datasets_list): logger.info( f"Evaluating on {eval_type} ds {idx}: {dataset.upper()}") prefix = dataset.split("_")[0] results = self._predict(idx, prefix, dataset, eval_type=eval_type, saved_epoch_idx=epoch, save_scores=save_scores) avg_loss = results['avg_loss'] num_samples = results['num_samples'] loss_agg.update(avg_loss, n=num_samples) loss_by_task[dataset] = avg_loss if is_dev: logger.info( f"Task {dataset} -- {eval_type} loss: {avg_loss:.3f}") metrics = results['metrics'] for key, val in metrics.items(): if is_dev: logger.info( f"Task {dataset} -- {eval_type} {key}: {val:.3f}") log_dict[f'{dataset}/{eval_type}_{key}'] = val uncertainty = results['uncertainty'] if is_dev: logger.info( f"Task {dataset} -- {eval_type} uncertainty: {uncertainty:.3f}" ) log_dict[ f'{eval_type}_uncertainty_by_task/{dataset}'] = uncertainty if prefix not in uncertainties_by_task: uncertainties_by_task[prefix] = uncertainty else: # exploiting the fact that only mnli has two dev sets uncertainties_by_task[prefix] += uncertainty uncertainties_by_task[prefix] /= 2 if is_dev: logger.info(f'{eval_type} loss: {loss_agg.avg}') log_dict[f'{eval_type}_loss'] = loss_agg.avg log_dict.update({ f'{eval_type}_loss_by_task/{task}': loss for task, loss in loss_by_task.items() }) loss_by_task_id = [None] * self.num_tasks for task_name, loss in loss_by_task.items(): loss_by_task_id[self.tasks[task_name]] = loss loss_by_task_id = np.asarray(loss_by_task_id) if is_dev: self.dev_loss_by_task = loss_by_task_id else: self.test_loss_by_task = loss_by_task_id # convert uncertainties_by_task from dict to list, where list[i] = weight of task_id i uncertainties_by_task_id = [None] * self.num_tasks for task_name, weight in uncertainties_by_task.items(): task_id = self.tasks[task_name] uncertainties_by_task_id[task_id] = weight 
uncertainties_by_task_id = np.asarray(uncertainties_by_task_id) return log_dict, uncertainties_by_task_id def _log_training(self, val_logs): train_loss_by_task = { f'train_loss_by_task/{task}': self.train_loss_by_task[task_idx].avg for task, task_idx in self.tasks.items() } train_loss_agg = {'train_loss': self.train_loss.avg} loss_weights_by_task = {} if self.config.uncertainty_based_weight or self.config.rate_based_weight: for task_name, task_id in self.tasks.items(): loss_weights_by_task[ f'loss_weight/{task_name}'] = self.loss_weights[ task_id] if self.loss_weights[ task_id] is not None else 1. log_dict = { 'global_step': self.updates, **train_loss_by_task, **train_loss_agg, **val_logs, **loss_weights_by_task } wandb.log(log_dict) def _predict(self, eval_ds_idx, eval_ds_prefix, eval_ds_name, eval_type='dev', saved_epoch_idx=None, save_scores=True): if eval_type not in {'dev', 'test'}: raise ValueError( "eval_type must be one of the following: 'dev' or 'test'.") is_dev = eval_type == 'dev' label_dict = self.task_defs.global_map.get(eval_ds_prefix, None) if is_dev: data: DataLoader = self.dev_dataloaders_list[eval_ds_idx] else: data: DataLoader = self.test_dataloaders_list[eval_ds_idx] if data is None: results = None else: with torch.no_grad(): ( metrics, predictions, scores, golds, ids, (eval_ds_avg_loss, eval_ds_num_samples), uncertainty, ) = self.eval_mode( data, metric_meta=self.task_defs.metric_meta_map[eval_ds_prefix], use_cuda=self.config.cuda, with_label=True, label_mapper=label_dict, task_type=self.task_defs.task_type_map[eval_ds_prefix]) results = { "metrics": metrics, "predictions": predictions, "uids": ids, "scores": scores, "uncertainty": uncertainty } if save_scores: score_file_prefix = f"{eval_ds_name}_{eval_type}_scores" \ + (f'_{saved_epoch_idx}' if saved_epoch_idx is not None else "") score_file = os.path.join(self.output_dir, score_file_prefix + ".json") MTDNNCommonUtils.dump(score_file, results) if self.config.use_glue_format: official_score_file = os.path.join( self.output_dir, score_file_prefix + ".tsv") submit(official_score_file, results, label_dict) results.update({ "avg_loss": eval_ds_avg_loss, "num_samples": eval_ds_num_samples }) return results def predict(self, trained_model_chckpt: str = None): """ Inference of model on test datasets """ # Load a trained checkpoint if a valid model checkpoint if trained_model_chckpt and gfile.exists(trained_model_chckpt): logger.info( f"Running predictions using: {trained_model_chckpt}. This may take 3 minutes." 
) self.load(trained_model_chckpt) logger.info("Checkpoint loaded.") self.config.batch_size_eval = 128 self.config.use_glue_format = True # test eval for idx, dataset in enumerate(self.test_datasets_list): prefix = dataset.split("_")[0] results = self._predict(idx, prefix, dataset, eval_type='test') if results: logger.info(f"[new test scores saved for {dataset}.]") else: logger.info(f"Data not found for {dataset}.") def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) params = { "state": network_state, "optimizer": self.optimizer.state_dict(), "config": self.config, } torch.save(params, gfile.GFile(filename, mode='wb')) logger.info("model saved to {}".format(filename)) def load(self, checkpoint): model_state_dict = torch.load(gfile.GFile(checkpoint, mode='rb')) self.network.load_state_dict(model_state_dict["state"], strict=False) self.optimizer.load_state_dict(model_state_dict["optimizer"]) self.config = model_state_dict["config"] def cuda(self): self.network.cuda(device=self.config.cuda_device) def supported_init_checkpoints(self): """List of allowed check points """ return [ "bert-base-uncased", "bert-base-cased", "bert-large-uncased", "mtdnn-base-uncased", "mtdnn-large-uncased", "roberta.base", "roberta.large", ] def update_config_with_training_opts( self, decoder_opts, task_types, dropout_list, loss_types, kd_loss_types, tasks_nclass_list, ): # Update configurations with options obtained from preprocessing training data setattr(self.config, "decoder_opts", decoder_opts) setattr(self.config, "task_types", task_types) setattr(self.config, "tasks_dropout_p", dropout_list) setattr(self.config, "loss_types", loss_types) setattr(self.config, "kd_loss_types", kd_loss_types) setattr(self.config, "tasks_nclass_list", tasks_nclass_list)
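# A compressed sketch of the sampling idea inside `_rerank_batches` above:
# per-task weights are normalised into a probability distribution (optionally
# through a softmax) and one task id is drawn per upcoming batch, with
# replacement. Function and variable names here are illustrative only.
import numpy as np

def sample_task_schedule(task_weights, num_batches, use_softmax=False, seed=None):
    rng = np.random.default_rng(seed)
    w = np.asarray(task_weights, dtype=float)
    if use_softmax:
        e = np.exp(w - w.max())       # stabilised softmax
        probs = e / e.sum()
    else:
        probs = w / w.sum()           # plain normalisation
    return rng.choice(len(w), size=num_batches, replace=True, p=probs)

# e.g. sample_task_schedule([0.2, 0.5, 0.3], num_batches=10)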
def finetune(self, clean_file, corrupt_file, data_dir="", validation_split=0.2, n_epochs=2, new_vocab_list=[]): if new_vocab_list: raise NotImplementedError("Do not currently support modifying output vocabulary of the models") # load data and split in train-validation data_dir = DEFAULT_TRAINTEST_DATA_PATH if data_dir == "default" else data_dir train_data = load_data(data_dir, clean_file, corrupt_file) train_data, valid_data = train_validation_split(train_data, 0.8, seed=11690) print("len of train and test data: ", len(train_data), len(valid_data)) # load vocab and model self.__model_status() # finetune ############################################# # training and validation ############################################# model, vocab = self.model, self.vocab TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 16, 32 GRADIENT_ACC = 4 DEVICE = self.device START_EPOCH, N_EPOCHS = 0, n_epochs CHECKPOINT_PATH = os.path.join(data_dir, "new_models", os.path.split(self.bert_pretrained_name_or_path)[-1]) if os.path.exists(CHECKPOINT_PATH): num = 1 while True: NEW_CHECKPOINT_PATH = CHECKPOINT_PATH + f"-{num}" if not os.path.exists(NEW_CHECKPOINT_PATH): break num += 1 CHECKPOINT_PATH = NEW_CHECKPOINT_PATH VOCAB_PATH = os.path.join(CHECKPOINT_PATH, "vocab.pkl") if not os.path.exists(CHECKPOINT_PATH): os.makedirs(CHECKPOINT_PATH) print(f"CHECKPOINT_PATH: {CHECKPOINT_PATH}") # running stats max_dev_acc, argmax_dev_acc = -1, -1 patience = 100 # Create an optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = int(len(train_data) / TRAIN_BATCH_SIZE / GRADIENT_ACC * N_EPOCHS) if t_total == 0: t_total = 1 optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=t_total) # model to device model.to(DEVICE) # load parameters if not training from scratch if START_EPOCH > 1: progress_write_file = ( open(os.path.join(CHECKPOINT_PATH, f"progress_retrain_from_epoch{START_EPOCH}.txt"), 'w') ) model, optimizer, max_dev_acc, argmax_dev_acc = load_pretrained(model, CHECKPOINT_PATH, optimizer=optimizer) progress_write_file.write(f"Training model params after loading from path: {CHECKPOINT_PATH}\n") else: progress_write_file = open(os.path.join(CHECKPOINT_PATH, "progress.txt"), 'w') print(f"Training model params") progress_write_file.write(f"Training model params\n") progress_write_file.flush() # train and eval for epoch_id in range(START_EPOCH, N_EPOCHS + 1): # check for patience if (epoch_id - argmax_dev_acc) > patience: print("patience count reached. early stopping initiated") print("max_dev_acc: {}, argmax_dev_acc: {}".format(max_dev_acc, argmax_dev_acc)) break # print epoch print(f"In epoch: {epoch_id}") progress_write_file.write(f"In epoch: {epoch_id}\n") progress_write_file.flush() # train loss and backprop train_loss = 0. train_acc = 0. train_acc_count = 0. 
print("train_data size: {}".format(len(train_data))) progress_write_file.write("train_data size: {}\n".format(len(train_data))) progress_write_file.flush() train_data_iter = batch_iter(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True) nbatches = int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE)) optimizer.zero_grad() for batch_id, (batch_labels, batch_sentences) in enumerate(train_data_iter): st_time = time.time() # set batch data for bert batch_labels_, batch_sentences_, batch_bert_inp, batch_bert_splits = \ bert_tokenize_for_valid_examples(batch_labels, batch_sentences, self.bert_pretrained_name_or_path) if len(batch_labels_) == 0: print("################") print("Not training the following lines due to pre-processing mismatch: \n") print([(a, b) for a, b in zip(batch_labels, batch_sentences)]) print("################") continue else: batch_labels, batch_sentences = batch_labels_, batch_sentences_ batch_bert_inp = {k: v.to(DEVICE) for k, v in batch_bert_inp.items()} # set batch data for others batch_labels, batch_lengths = labelize(batch_labels, vocab) # batch_lengths = batch_lengths.to(device) batch_labels = batch_labels.to(DEVICE) # forward model.train() loss = model(batch_bert_inp, batch_bert_splits, targets=batch_labels) batch_loss = loss.cpu().detach().numpy() train_loss += batch_loss # backward if GRADIENT_ACC > 1: loss = loss / GRADIENT_ACC loss.backward() # step if (batch_id + 1) % GRADIENT_ACC == 0 or batch_id >= nbatches - 1: # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() # scheduler.step() optimizer.zero_grad() # compute accuracy in numpy if batch_id % 10000 == 0: train_acc_count += 1 model.eval() with torch.no_grad(): _, batch_predictions = model(batch_bert_inp, batch_bert_splits, targets=batch_labels) model.train() batch_labels = batch_labels.cpu().detach().numpy() batch_lengths = batch_lengths.cpu().detach().numpy() ncorr, ntotal = batch_accuracy_func(batch_predictions, batch_labels, batch_lengths) batch_acc = ncorr / ntotal train_acc += batch_acc # update progress progressBar(batch_id + 1, int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE)), ["batch_time", "batch_loss", "avg_batch_loss", "batch_acc", "avg_batch_acc"], [time.time() - st_time, batch_loss, train_loss / (batch_id + 1), batch_acc, train_acc / train_acc_count]) if batch_id == 0 or (batch_id + 1) % 5000 == 0: nb = int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE)) progress_write_file.write(f"{batch_id + 1}/{nb}\n") progress_write_file.write( f"batch_time: {time.time() - st_time}, avg_batch_loss: {train_loss / (batch_id + 1)}, " f"avg_batch_acc: {train_acc / train_acc_count}\n") progress_write_file.flush() print(f"\nEpoch {epoch_id} train_loss: {train_loss / (batch_id + 1)}") # valid loss valid_loss = 0. valid_acc = 0. 
print("valid_data size: {}".format(len(valid_data))) progress_write_file.write("valid_data size: {}\n".format(len(valid_data))) progress_write_file.flush() valid_data_iter = batch_iter(valid_data, batch_size=VALID_BATCH_SIZE, shuffle=False) for batch_id, (batch_labels, batch_sentences) in enumerate(valid_data_iter): st_time = time.time() # set batch data for bert batch_labels_, batch_sentences_, batch_bert_inp, batch_bert_splits = \ bert_tokenize_for_valid_examples(batch_labels, batch_sentences, self.bert_pretrained_name_or_path) if len(batch_labels_) == 0: print("################") print("Not validating the following lines due to pre-processing mismatch: \n") print([(a, b) for a, b in zip(batch_labels, batch_sentences)]) print("################") continue else: batch_labels, batch_sentences = batch_labels_, batch_sentences_ batch_bert_inp = {k: v.to(DEVICE) for k, v in batch_bert_inp.items()} # set batch data for others batch_labels, batch_lengths = labelize(batch_labels, vocab) # batch_lengths = batch_lengths.to(device) batch_labels = batch_labels.to(DEVICE) # forward model.eval() with torch.no_grad(): batch_loss, batch_predictions = model(batch_bert_inp, batch_bert_splits, targets=batch_labels) model.train() valid_loss += batch_loss # compute accuracy in numpy batch_labels = batch_labels.cpu().detach().numpy() batch_lengths = batch_lengths.cpu().detach().numpy() ncorr, ntotal = batch_accuracy_func(batch_predictions, batch_labels, batch_lengths) batch_acc = ncorr / ntotal valid_acc += batch_acc # update progress progressBar(batch_id + 1, int(np.ceil(len(valid_data) / VALID_BATCH_SIZE)), ["batch_time", "batch_loss", "avg_batch_loss", "batch_acc", "avg_batch_acc"], [time.time() - st_time, batch_loss, valid_loss / (batch_id + 1), batch_acc, valid_acc / (batch_id + 1)]) if batch_id == 0 or (batch_id + 1) % 2000 == 0: nb = int(np.ceil(len(valid_data) / VALID_BATCH_SIZE)) progress_write_file.write(f"{batch_id}/{nb}\n") progress_write_file.write( f"batch_time: {time.time() - st_time}, avg_batch_loss: {valid_loss / (batch_id + 1)}, " f"avg_batch_acc: {valid_acc / (batch_id + 1)}\n") progress_write_file.flush() print(f"\nEpoch {epoch_id} valid_loss: {valid_loss / (batch_id + 1)}") # save model, optimizer and test_predictions if val_acc is improved if valid_acc >= max_dev_acc: print(f"validation accuracy improved from {max_dev_acc:.4f} to {valid_acc:.4f}") # name = "model.pth.tar".format(epoch_id) # torch.save({ # 'epoch_id': epoch_id, # 'max_dev_acc': max_dev_acc, # 'argmax_dev_acc': argmax_dev_acc, # 'model_state_dict': model.state_dict(), # 'optimizer_state_dict': optimizer.state_dict()}, # os.path.join(CHECKPOINT_PATH, name)) name = "pytorch_model.bin" torch.save(model.state_dict(), os.path.join(CHECKPOINT_PATH, name)) print("Model saved at {} in epoch {}".format(os.path.join(CHECKPOINT_PATH, name), epoch_id)) save_vocab_dict(VOCAB_PATH, vocab) # re-assign max_dev_acc, argmax_dev_acc = valid_acc, epoch_id print(f"Model and logs saved at {CHECKPOINT_PATH}") return
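# The finetuning loop above divides each loss by GRADIENT_ACC and only calls
# optimizer.step() every GRADIENT_ACC batches (or on the final batch). The
# bare skeleton of that pattern with the BERT-specific details stripped out;
# the (inputs, targets) batch layout and the model call are assumptions made
# for illustration.
import torch

def train_with_accumulation(model, loader, optimizer, accumulation_steps=4, device='cpu'):
    model.train()
    optimizer.zero_grad()
    n_batches = len(loader)
    for batch_id, (inputs, targets) in enumerate(loader):
        loss = model(inputs.to(device), targets=targets.to(device))
        # scale so the accumulated gradients match one large batch
        (loss / accumulation_steps).backward()
        if (batch_id + 1) % accumulation_steps == 0 or batch_id >= n_batches - 1:
            optimizer.step()
            optimizer.zero_grad()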
def train_bert_cased(t_config, p_config, s_config):
    device = torch.device('cuda')
    seed_everything(s_config.seed)

    train = pd.read_csv('../input/train.csv').sample(
        t_config.num_to_load + t_config.valid_size, random_state=s_config.seed)
    train = prepare_train_text(train, p_config)
    train = train.fillna(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    train_processed = get_tokenized_samples(t_config.MAX_SEQUENCE_LENGTH, tokenizer,
                                            train['text_proc'])

    # Sequence lengths: position of the first pad token (0), or the full width if unpadded
    sequences = train_processed
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    MyModel = BertForSequenceClassification.from_pretrained('bert-base-cased',
                                                            num_labels=t_config.num_labels)
    MyModel.to(device)

    # Prepare target
    target_train = train['target'].values[:t_config.num_to_load]
    target_train_aux = train[['severe_toxicity', 'obscene', 'identity_attack',
                              'insult', 'threat']].values[:t_config.num_to_load]
    target_train_identity = train[identity_columns].values[:t_config.num_to_load]
    target_val = train['target'].values[t_config.num_to_load:]
    target_val_aux = train[['severe_toxicity', 'obscene', 'identity_attack',
                            'insult', 'threat']].values[t_config.num_to_load:]
    target_val_identity = train[identity_columns].values[t_config.num_to_load:]

    # Prepare training data
    inputs_train = train_processed[:t_config.num_to_load]
    inputs_val = train_processed[t_config.num_to_load:]
    weight_train = train['weight'].values[:t_config.num_to_load]
    weight_val = train['weight'].values[t_config.num_to_load:]
    lengths_train = lengths[:t_config.num_to_load]
    lengths_val = lengths[t_config.num_to_load:]

    inputs_train = torch.tensor(inputs_train, dtype=torch.int64)
    Target_train = torch.Tensor(target_train)
    Target_train_aux = torch.Tensor(target_train_aux)
    Target_train_identity = torch.Tensor(target_train_identity)
    weight_train = torch.Tensor(weight_train)
    Lengths_train = torch.tensor(lengths_train, dtype=torch.int64)

    inputs_val = torch.tensor(inputs_val, dtype=torch.int64)
    Target_val = torch.Tensor(target_val)
    Target_val_aux = torch.Tensor(target_val_aux)
    Target_val_identity = torch.Tensor(target_val_identity)
    weight_val = torch.Tensor(weight_val)
    Lengths_val = torch.tensor(lengths_val, dtype=torch.int64)

    # Prepare dataset
    train_dataset = data.TensorDataset(inputs_train, Target_train, Target_train_aux,
                                       Target_train_identity, weight_train, Lengths_train)
    val_dataset = data.TensorDataset(inputs_val, Target_val, Target_val_aux,
                                     Target_val_identity, weight_val, Lengths_val)

    # Bucket sequencing: order samples by length so each batch holds similarly long sequences
    ids_train = lengths_train.argsort(kind="stable")
    ids_train_new = resort_index(ids_train, t_config.num_of_bucket, s_config.seed)
    train_loader = torch.utils.data.DataLoader(data.Subset(train_dataset, ids_train_new),
                                               batch_size=t_config.batch_size,
                                               collate_fn=clip_to_max_len,
                                               shuffle=False)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in list(MyModel.named_parameters())
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in list(MyModel.named_parameters())
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=t_config.learning_rate,
                         betas=[0.9, 0.999],
                         warmup=t_config.warmup,
                         t_total=t_config.num_epoch * len(train_loader) // t_config.accumulation_steps)

    # Freeze the first ten parameter tensors of the model
    i = 0
    for n, p in list(MyModel.named_parameters()):
        if i < 10:
            p.requires_grad = False
        i += 1

    # Initialise the classifier bias to the log-odds of the base positive rate
    p = train['target'].mean()
    likelihood = np.log(p / (1 - p))
    model_bias = torch.tensor(likelihood).type(torch.float)
    MyModel.classifier.bias = nn.Parameter(model_bias.to(device))

    MyModel, optimizer = amp.initialize(MyModel, optimizer, opt_level="O1", verbosity=0)

    for epoch in range(t_config.num_epoch):
        i = 0
        print('Training start')
        optimizer.zero_grad()
        MyModel.train()
        for batch_idx, (input, target, target_aux, target_identity, sample_weight) in tqdm_notebook(
                enumerate(train_loader), total=len(train_loader)):
            y_pred = MyModel(input.to(device), attention_mask=(input > 0).to(device))

            # Sample-weighted BCE on the main toxicity target
            loss = F.binary_cross_entropy_with_logits(y_pred[0][:, 0], target.to(device),
                                                      reduction='none')
            loss = (loss * sample_weight.to(device)).sum() / (sample_weight.sum().to(device))

            # Auxiliary toxicity-subtype targets
            loss_aux = F.binary_cross_entropy_with_logits(y_pred[0][:, 1:6], target_aux.to(device),
                                                          reduction='none').mean(axis=1)
            loss_aux = (loss_aux * sample_weight.to(device)).sum() / (sample_weight.sum().to(device))
            loss += loss_aux

            # Identity targets, if the model predicts them
            if t_config.num_labels == 15:
                loss_identity = F.binary_cross_entropy_with_logits(
                    y_pred[0][:, 6:], target_identity.to(device),
                    reduction='none').mean(axis=1)
                loss_identity = (loss_identity * sample_weight.to(device)).sum() / (
                    sample_weight.sum().to(device))
                loss += loss_identity

            # Use apex mixed precision (scaled loss) for the backward pass
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            # Use accumulation steps to tune the effective batch size of training
            if (i + 1) % t_config.accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            i += 1

    torch.save(
        {
            'model_state_dict': MyModel.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, f'{t_config.PATH}')
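# train_bert_cased() above initialises the classifier bias to the log-odds of
# the base positive rate, so the untrained model's sigmoid output starts near
# the observed rate instead of 0.5. A standalone sketch of that trick; the
# helper name is illustrative, and it fills every output bias with the same value.
import numpy as np
import torch
import torch.nn as nn

def init_output_bias(linear_layer: nn.Linear, positive_rate: float):
    # sigmoid(log(p / (1 - p))) == p, so the initial prediction matches the prior
    bias = float(np.log(positive_rate / (1.0 - positive_rate)))
    with torch.no_grad():
        linear_layer.bias.fill_(bias)

# e.g. init_output_bias(MyModel.classifier, train['target'].mean())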
def run_train(self, train, dev):
    if not os.path.isdir(self.args.dsave):
        os.makedirs(self.args.dsave)

    logger = logging.getLogger(self.__class__.__name__)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(self.args.dsave, 'train.log'))
    fh.setLevel(logging.CRITICAL)
    logger.addHandler(fh)
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)
    logger.addHandler(ch)

    num_train_steps = int(len(train) / self.args.train_batch * self.args.epoch)

    # remove pooler
    param_optimizer = list(self.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=self.args.learning_rate,
                         warmup=self.args.warmup, t_total=num_train_steps)

    print('num_train', len(train))
    print('num_dev', len(dev))

    global_step = 0
    best_metrics = {self.args.early_stop: -float('inf')}
    for epoch in trange(self.args.epoch, desc='epoch'):
        self.epoch = epoch
        train = train[:]
        np.random.shuffle(train)

        train_stats = defaultdict(list)
        gates = []
        preds = []
        self.train()
        for i in trange(0, len(train), self.args.train_batch, desc='batch'):
            actual_train_batch = int(self.args.train_batch / self.args.gradient_accumulation_steps)
            batch_stats = defaultdict(list)
            batch = train[i: i + self.args.train_batch]
            for accu_i in range(0, len(batch), actual_train_batch):
                actual_batch = batch[accu_i: accu_i + actual_train_batch]
                out = self(actual_batch)
                gates.extend(out['gate'])
                pred = self.extract_preds(out, actual_batch)
                loss = self.compute_loss(out, actual_batch)
                for k, v in loss.items():
                    loss[k] = v / self.args.gradient_accumulation_steps
                    batch_stats[k].append(v.item() / self.args.gradient_accumulation_steps)
                sum(loss.values()).backward()
                preds += pred

            lr_this_step = self.args.learning_rate * warmup_linear(global_step / num_train_steps,
                                                                   self.args.warmup)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

            for k in batch_stats.keys():
                train_stats['loss_' + k].append(sum(batch_stats[k]))

            if global_step % self.args.eval_every_steps == 0:
                dev_stats = defaultdict(list)
                dev_preds, dev_gates = self.run_pred(dev)
                dev_metrics = {k: sum(v) / len(v) for k, v in dev_stats.items()}
                dev_metrics.update(self.compute_metrics(dev_preds, dev))
                dev_metrics.update({'gate_avg': mean(dev_gates)})
                dev_metrics.update({'gate_std': stdev(dev_gates)})

                metrics = {'global_step': global_step}
                # metrics.update({'train_' + k: v for k, v in train_metrics.items()})
                metrics.update({'dev_' + k: v for k, v in dev_metrics.items()})
                logger.critical(pformat(metrics))
                if metrics[self.args.early_stop] > best_metrics[self.args.early_stop]:
                    logger.critical('Found new best! Saving to ' + self.args.dsave)
                    best_metrics = metrics
                    self.save(best_metrics, self.args.dsave, self.args.early_stop)
                    with open(os.path.join(self.args.dsave, 'dev.preds.json'), 'wt') as f:
                        json.dump(dev_preds, f, indent=2)
                    with open(os.path.join(self.args.dsave, 'dev.best_metrics.json'), 'wt') as f:
                        json.dump(best_metrics, f, indent=2)
                self.train()

        train_metrics = {k: sum(v) / len(v) for k, v in train_stats.items()}
        train_metrics.update(self.compute_metrics(preds, train))
        train_metrics.update({'gate_avg': mean(gates)})
        train_metrics.update({'gate_std': stdev(gates)})

        dev_stats = defaultdict(list)
        dev_preds, dev_gates = self.run_pred(dev)
        dev_metrics = {k: sum(v) / len(v) for k, v in dev_stats.items()}
        dev_metrics.update(self.compute_metrics(dev_preds, dev))
        dev_metrics.update({'gate_avg': mean(dev_gates)})
        dev_metrics.update({'gate_std': stdev(dev_gates)})

        metrics = {'global_step': global_step}
        metrics.update({'train_' + k: v for k, v in train_metrics.items()})
        metrics.update({'dev_' + k: v for k, v in dev_metrics.items()})
        logger.critical(pformat(metrics))
        if metrics[self.args.early_stop] > best_metrics[self.args.early_stop]:
            logger.critical('Found new best! Saving to ' + self.args.dsave)
            best_metrics = metrics
            self.save(best_metrics, self.args.dsave, self.args.early_stop)
            with open(os.path.join(self.args.dsave, 'dev.preds.json'), 'wt') as f:
                json.dump(dev_preds, f, indent=2)
            with open(os.path.join(self.args.dsave, 'dev.best_metrics.json'), 'wt') as f:
                json.dump(best_metrics, f, indent=2)

    logger.critical('Best dev')
    logger.critical(pformat(best_metrics))
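# run_train() above sets the learning rate by hand each step via
# warmup_linear(global_step / num_train_steps, warmup), the classic
# pytorch-pretrained-bert schedule: linear ramp-up over the warmup fraction,
# then linear decay. A sketch of that schedule (the clamp at zero is an
# addition here, not part of the original helper):
def warmup_linear_schedule(progress, warmup=0.1):
    # progress = global_step / num_train_steps, expected in [0, 1]
    if progress < warmup:
        return progress / warmup            # ramp up
    return max(0.0, 1.0 - progress)         # then decay linearly towards zero

# lr_this_step = base_lr * warmup_linear_schedule(global_step / num_train_steps, warmup)
# for param_group in optimizer.param_groups:
#     param_group['lr'] = lr_this_step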
def train(self): with open(os.path.join(self.results_folder, "log.txt"), "w") as f_log: for train, test in LeaveOneOut().split(self.dfs): train_set = [self.dfs[i] for i in train] test_set = self.dfs[test[0]] # Create sentence and label lists sentences_list = [] labels_list = [] for i, book in enumerate(train_set): sentences_list.extend(book.sentence.values) labels_list.extend(book.label.values) f_log.write("Length book: " + str(len(sentences_list[i])) + '\n') f_log.write("Sentences: " + str(len(sentences_list)) + ", labels:" + str(len(labels_list)) + '\n') MAX_LEN = 128 # We need to add special tokens at the beginning and end of each sentence for BERT to work properly sentences_train = [ self.tokenizer.encode_plus(sent, add_special_tokens=True, max_length=MAX_LEN) for i, sent in enumerate(sentences_list) ] le = LabelEncoder() labels_train = labels_list f_log.write(str(labels_train[:10]) + '\n') f_log.write('Analyze labels' + '\n') le.fit(labels_train) le_name_mapping = dict( zip(le.classes_, le.transform(le.classes_))) f_log.write(str(le_name_mapping) + '\n') labels_train = le.fit_transform(labels_train) # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids_train = [ inputs["input_ids"] for inputs in sentences_train ] # Pad our input tokens input_ids_train = pad_sequences(input_ids_train, maxlen=MAX_LEN, truncating="post", padding="post") # Create attention masks attention_masks_train = [] # Create a mask of 1s for each token followed by 0s for padding for seq in input_ids_train: seq_mask_train = [float(i > 0) for i in seq] attention_masks_train.append(seq_mask_train) # Use train_test_split to split our data into train and validation sets for training train_inputs, train_labels = input_ids_train, labels_train train_masks, _ = attention_masks_train, input_ids_train # Convert all of our data into torch tensors, the required datatype for our model train_inputs = torch.tensor(train_inputs).to(torch.int64) train_labels = torch.tensor(train_labels).to(torch.int64) train_masks = torch.tensor(train_masks).to(torch.int64) batch_size = 32 # Create an iterator of our data with torch DataLoader. 
This helps save on memory during training # because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into # memory train_data = TensorDataset(train_inputs, train_masks, train_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) torch.cuda.empty_cache() # BINARY CLASSIFIER model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=2) model.cuda() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] # This variable contains all of the hyperparemeter information our training loop needs optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1) train_loss_set = [] # Number of training epochs (authors recommend between 2 and 4) epochs = 10 device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") torch.cuda.get_device_name(0) for _ in trange(epochs, desc="Epoch"): # Training # Set our model to training mode (as opposed to evaluation mode) model.train() # Tracking variables tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # Train the data for one epoch for step, batch in enumerate(train_dataloader): # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Clear out the gradients (by default they accumulate) optimizer.zero_grad() # Forward pass loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) train_loss_set.append(loss.item()) # Backward pass loss.backward() # Update parameters and take a step using the computed gradient optimizer.step() # Update tracking variables tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 f_log.write("Train loss: {}".format(tr_loss / nb_tr_steps) + '\n') plt.figure(figsize=(15, 8)) plt.title("Training loss") plt.xlabel("Batch") plt.ylabel("Loss") plt.plot(train_loss_set) plt.savefig(self.img_folder + 'train' + str(test[0]) + '.png') model_to_save = model WEIGHTS_NAME = "BERT_Novel_test" + str(test[0]) + ".bin" OUTPUT_DIR = self.models_folder output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME) f_log.write(str(output_model_file) + '\n') torch.save(model_to_save.state_dict(), output_model_file) state_dict = torch.load(output_model_file) model.load_state_dict(state_dict) sentences6 = test_set.sentence.values f_log.write(str(len(sentences6)) + '\n') labels6 = test_set.label.values labels_test = labels6 sentences11 = sentences6 sentences_test = [ self.tokenizer.encode_plus(sent, add_special_tokens=True, max_length=MAX_LEN) for i, sent in enumerate(sentences11) ] f_log.write('Analyze labels test' + '\n') le.fit(labels_test) le_name_mapping = dict( zip(le.classes_, le.transform(le.classes_))) f_log.write(str(le_name_mapping) + '\n') labels_test = le.fit_transform(labels_test) MAX_LEN = 128 # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids1 = [inputs["input_ids"] for inputs in sentences_test] # Pad our input tokens input_ids1 = pad_sequences(input_ids1, maxlen=MAX_LEN, truncating="post", padding="post") # Create attention masks attention_masks1 = [] # Create a mask of 1s for each token followed by 0s for 
padding for seq in input_ids1: seq_mask1 = [float(i > 0) for i in seq] attention_masks1.append(seq_mask1) f_log.write(str(len(attention_masks1[0])) + '\n') prediction_inputs = torch.tensor(input_ids1).to(torch.int64) prediction_masks = torch.tensor(attention_masks1).to( torch.int64) prediction_labels = torch.tensor(labels_test).to(torch.int64) batch_size = 32 prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) prediction_sampler = SequentialSampler(prediction_data) prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) # Prediction on test set # Put model in evaluation mode model.eval() # Tracking variables predictions, true_labels = [], [] # Predict for batch in prediction_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Telling the model not to compute or store gradients, saving memory and speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Store predictions and true labels predictions.append(logits) true_labels.append(label_ids) f_log.write( str(len(predictions)) + ' ' + str(len(true_labels)) + '\n') f_log.write(str(predictions[0][0]) + '\n') # Import and evaluate each test batch using Matthew's correlation coefficient matthews_set = [] for i in range(len(true_labels)): matthews = matthews_corrcoef( true_labels[i], np.argmax(predictions[i], axis=1).flatten()) matthews_set.append(matthews) # Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset flat_predictions = [ item for sublist in predictions for item in sublist ] flat_predictions = np.argmax(flat_predictions, axis=1).flatten() flat_true_labels = [ item for sublist in true_labels for item in sublist ] f_log.write( str(len(flat_predictions) + ' ' + len(flat_true_labels)) + '\n') f_log.write( str(flat_predictions[989:994] + ' ' + flat_true_labels[989:994]) + '\n') f_log.write( str(flat_predictions[0:11] + ' ' + flat_true_labels[0:11]) + '\n') f_log.write('Classification Report' + '\n') f_log.write( str( classification_report(flat_true_labels, flat_predictions)) + '\n') f_log.write( str(confusion_matrix(flat_true_labels, flat_predictions)) + '\n')
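# The leave-one-out evaluation above scores each test batch with the Matthews correlation coefficient and then
# computes an aggregate MCC over the flattened predictions. A compact, runnable sketch of that evaluation with
# synthetic logits standing in for the model outputs (shapes and names here are illustrative only):
import numpy as np
from sklearn.metrics import classification_report, matthews_corrcoef

rng = np.random.RandomState(0)
predictions = [rng.randn(8, 2) for _ in range(3)]            # one [batch_size, 2] logit array per batch
true_labels = [rng.randint(0, 2, size=8) for _ in range(3)]  # gold labels per batch

matthews_set = [matthews_corrcoef(gold, np.argmax(logits, axis=1))
                for logits, gold in zip(predictions, true_labels)]

flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)
flat_true_labels = np.concatenate(true_labels)
print('per-batch MCC:', ['%.3f' % m for m in matthews_set])
print('aggregate MCC: %.3f' % matthews_corrcoef(flat_true_labels, flat_predictions))
print(classification_report(flat_true_labels, flat_predictions))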
def train(self): if self.debug_mode: self.epochs = 1 # 加载 dataloader train_loader, valid_loader = self.create_dataloader() # 训练 self.seed_everything() lr = 2e-5 accumulation_steps = math.ceil(self.batch_size / self.base_batch_size) # 预训练 bert 转成 pytorch if os.path.exists(self.work_dir + 'pytorch_model.bin') is False: print("Convert pre-trained model") convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch( self.bert_model_path + 'bert_model.ckpt', self.bert_model_path + 'bert_config.json', self.work_dir + 'pytorch_model.bin') shutil.copyfile(self.bert_model_path + 'bert_config.json', self.work_dir + 'bert_config.json') # 加载预训练模型 print("Load checkpoint") model = BertNeuralNet.from_pretrained(self.work_dir, cache_dir=None) # TODO: 读取模型 model.load_state_dict( torch.load("../input/train48-bert-kernel/model_last.bin")) model.zero_grad() model = model.to(self.device) # 不同的参数组设置不同的 weight_decay param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] epoch_steps = int(self.train_len * 0.5 / self.base_batch_size / accumulation_steps) num_train_optimization_steps = int(self.epochs * epoch_steps) valid_every = math.floor(epoch_steps * accumulation_steps / 5) optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=-1, t_total=-1) # 渐变学习速率 #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # 开始训练 print("Train") best_auc_score_1 = 0 best_auc_score_2 = 0 best_auc_score_3 = 0 best_auc_score_4 = 0 f_log = open("train_log.txt", "w") model.eval() new_valid_loader = copy.deepcopy(valid_loader) y_pred = np.zeros((len(self.train_df) - self.train_len)) for j, valid_batch_data in enumerate(new_valid_loader): x_batch = valid_batch_data[0] batch_y_pred = self.sigmoid( model(x_batch.to(self.device), attention_mask=(x_batch > 0).to(self.device), labels=None).detach().cpu().numpy())[:, 0] y_pred[j * self.base_batch_size:(j + 1) * self.base_batch_size] = batch_y_pred # 计算得分 auc_score = self.evaluator.get_final_metric(y_pred) f_log.write("init auc_score: %.4f\n" % auc_score) print("init auc_score: %.4f" % auc_score) for epoch in range(self.epochs): model.train() optimizer.zero_grad() # 加载每个 batch 并训练 train_start_time = time.time() for i, batch_data in enumerate(train_loader): x_batch = batch_data[0] y_batch = batch_data[1] target_weight_batch = batch_data[2] aux_weight_batch = batch_data[3] identity_weight_batch = batch_data[4] np_weight_batch = batch_data[5] np_identity_weight_batch = batch_data[6] y_pred = model(x_batch.to(self.device), attention_mask=(x_batch > 0).to(self.device), labels=None) target_loss, aux_loss, identity_loss, np_loss = self.custom_loss( y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch, identity_weight_batch, np_weight_batch) loss = target_loss + aux_loss + identity_loss + np_loss with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1) % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() # 验证 if (i + 1) % valid_every == 0: model.eval() stage = int((i + 1) / valid_every) train_stage_duration = int( (time.time() - train_start_time) / 60) valid_start_time = time.time() y_pred = np.zeros((len(self.train_df) - 
self.train_len)) for j, valid_batch_data in enumerate(valid_loader): x_batch = valid_batch_data[0] batch_y_pred = self.sigmoid( model(x_batch.to(self.device), attention_mask=(x_batch > 0).to(self.device), labels=None).detach().cpu().numpy())[:, 0] y_pred[j * self.base_batch_size:(j + 1) * self.base_batch_size] = batch_y_pred # 计算得分 auc_score = self.evaluator.get_final_metric(y_pred) valid_duration = int((time.time() - valid_start_time) / 60) train_start_time = time.time() f_log.write( "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n" % (epoch, stage, train_stage_duration, valid_duration, auc_score)) print( "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f" % (epoch, stage, train_stage_duration, valid_duration, auc_score)) if auc_score > best_auc_score_4: state_dict = model.state_dict() if auc_score > best_auc_score_1: best_auc_score_1 = auc_score torch.save(state_dict, "model1.bin") elif auc_score > best_auc_score_2: best_auc_score_2 = auc_score torch.save(state_dict, "model2.bin") elif auc_score > best_auc_score_3: best_auc_score_3 = auc_score torch.save(state_dict, "model3.bin") else: best_auc_score_4 = auc_score torch.save(state_dict, "model4.bin") with open("model_score.txt", "w") as f: f.write( "model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4)) print( "model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4)) model.train() if self.last is True: state_dict = model.state_dict() torch.save(state_dict, "model_last.bin") # del 训练相关输入和模型 training_history = [ train_loader, valid_loader, model, optimizer, param_optimizer, optimizer_grouped_parameters ] for variable in training_history: del variable gc.collect()
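# The loop above steps the optimizer only every accumulation_steps mini-batches so a large effective batch fits in
# GPU memory (with apex amp handling the fp16 loss scaling). A minimal fp32 sketch of that accumulation pattern
# with a toy model in place of BertNeuralNet; the loss is divided by accumulation_steps here so the summed
# gradients approximate one large-batch average:
import torch
import torch.nn as nn

accumulation_steps = 4
model = nn.Linear(16, 1)                                   # toy stand-in for the BERT model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
loss_fn = nn.BCEWithLogitsLoss()

optimizer.zero_grad()
for i in range(16):
    x = torch.randn(8, 16)
    y = torch.randint(0, 2, (8, 1)).float()
    loss = loss_fn(model(x), y) / accumulation_steps       # scale each mini-batch loss
    loss.backward()                                        # gradients accumulate across mini-batches
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()                                   # one update per effective batch
        optimizer.zero_grad()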
class MTDNNModel(MTDNNPretrainedModel): """Instance of an MTDNN Model Arguments: MTDNNPretrainedModel {BertPretrainedModel} -- Inherited from Bert Pretrained config {MTDNNConfig} -- MTDNN Configuration Object pretrained_model_name {str} -- Name of the pretrained model to initial checkpoint num_train_step {int} -- Number of steps to take each training Raises: RuntimeError: [description] ImportError: [description] Returns: MTDNNModel -- An Instance of an MTDNN Model """ def __init__( self, config: MTDNNConfig, task_defs: MTDNNTaskDefs, pretrained_model_name: str = "mtdnn-base-uncased", num_train_step: int = -1, decoder_opts: list = None, task_types: list = None, dropout_list: list = None, loss_types: list = None, kd_loss_types: list = None, tasks_nclass_list: list = None, multitask_train_dataloader: DataLoader = None, dev_dataloaders_list: list = None, # list of dataloaders test_dataloaders_list: list = None, # list of dataloaders test_datasets_list: list = ["mnli_mismatched", "mnli_matched"], output_dir: str = "checkpoint", log_dir: str = "tensorboard_logdir", ): # Input validation assert ( config.init_checkpoint in self.supported_init_checkpoints() ), f"Initial checkpoint must be in {self.supported_init_checkpoints()}" assert decoder_opts, "Decoder options list is required!" assert task_types, "Task types list is required!" assert dropout_list, "Task dropout list is required!" assert loss_types, "Loss types list is required!" assert kd_loss_types, "KD Loss types list is required!" assert tasks_nclass_list, "Tasks nclass list is required!" assert (multitask_train_dataloader ), "DataLoader for multiple tasks cannot be None" assert test_datasets_list, "Pass a list of test dataset prefixes" super(MTDNNModel, self).__init__(config) # Initialize model config and update with training options self.config = config self.update_config_with_training_opts( decoder_opts, task_types, dropout_list, loss_types, kd_loss_types, tasks_nclass_list, ) self.task_defs = task_defs self.multitask_train_dataloader = multitask_train_dataloader self.dev_dataloaders_list = dev_dataloaders_list self.test_dataloaders_list = test_dataloaders_list self.test_datasets_list = test_datasets_list self.output_dir = output_dir self.log_dir = log_dir # Create the output_dir if it's doesn't exist MTDNNCommonUtils.create_directory_if_not_exists(self.output_dir) self.tensor_board = SummaryWriter(log_dir=self.log_dir) self.pooler = None # Resume from model checkpoint if self.config.resume and self.config.model_ckpt: assert os.path.exists( self.config.model_ckpt), "Model checkpoint does not exist" logger.info(f"loading model from {self.config.model_ckpt}") self = self.load(self.config.model_ckpt) return # Setup the baseline network # - Define the encoder based on config options # - Set state dictionary based on configuration setting # - Download pretrained model if flag is set # TODO - Use Model.pretrained_model() after configuration file is hosted. if self.config.use_pretrained_model: with MTDNNCommonUtils.download_path() as file_path: path = pathlib.Path(file_path) self.local_model_path = MTDNNCommonUtils.maybe_download( url=self. 
pretrained_model_archive_map[pretrained_model_name], log=logger, ) self.bert_model = MTDNNCommonUtils.load_pytorch_model( self.local_model_path) self.state_dict = self.bert_model["state"] else: # Set the config base on encoder type set for initial checkpoint if config.encoder_type == EncoderModelType.BERT: self.bert_config = BertConfig.from_dict(self.config.to_dict()) self.bert_model = BertModel.from_pretrained( self.config.init_checkpoint) self.state_dict = self.bert_model.state_dict() self.config.hidden_size = self.bert_config.hidden_size if config.encoder_type == EncoderModelType.ROBERTA: # Download and extract from PyTorch hub if not downloaded before self.bert_model = torch.hub.load("pytorch/fairseq", config.init_checkpoint) self.config.hidden_size = self.bert_model.args.encoder_embed_dim self.pooler = LinearPooler(self.config.hidden_size) new_state_dict = {} for key, val in self.bert_model.state_dict().items(): if key.startswith("model.decoder.sentence_encoder" ) or key.startswith( "model.classification_heads"): key = f"bert.{key}" new_state_dict[key] = val # backward compatibility PyTorch <= 1.0.0 if key.startswith("classification_heads"): key = f"bert.model.{key}" new_state_dict[key] = val self.state_dict = new_state_dict self.updates = (self.state_dict["updates"] if self.state_dict and "updates" in self.state_dict else 0) self.local_updates = 0 self.train_loss = AverageMeter() self.network = SANBERTNetwork( init_checkpoint_model=self.bert_model, pooler=self.pooler, config=self.config, ) if self.state_dict: self.network.load_state_dict(self.state_dict, strict=False) self.mnetwork = (nn.DataParallel(self.network) if self.config.multi_gpu_on else self.network) self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) # Move network to GPU if device available and flag set if self.config.cuda: self.network.cuda(device=self.config.cuda_device) self.optimizer_parameters = self._get_param_groups() self._setup_optim(self.optimizer_parameters, self.state_dict, num_train_step) self.para_swapped = False self.optimizer.zero_grad() self._setup_lossmap() def _get_param_groups(self): no_decay = [ "bias", "gamma", "beta", "LayerNorm.bias", "LayerNorm.weight" ] optimizer_parameters = [ { "params": [ p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] return optimizer_parameters def _setup_optim(self, optimizer_parameters, state_dict: dict = None, num_train_step: int = -1): # Setup optimizer parameters if self.config.optimizer == "sgd": self.optimizer = optim.SGD( optimizer_parameters, self.config.learning_rate, weight_decay=self.config.weight_decay, ) elif self.config.optimizer == "adamax": self.optimizer = Adamax( optimizer_parameters, self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, weight_decay=self.config.weight_decay, ) elif self.config.optimizer == "radam": self.optimizer = RAdam( optimizer_parameters, self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, eps=self.config.adam_eps, weight_decay=self.config.weight_decay, ) # The current radam does not support FP16. 
self.config.fp16 = False elif self.config.optimizer == "adam": self.optimizer = Adam( optimizer_parameters, lr=self.config.learning_rate, warmup=self.config.warmup, t_total=num_train_step, max_grad_norm=self.config.grad_clipping, schedule=self.config.warmup_schedule, weight_decay=self.config.weight_decay, ) else: raise RuntimeError( f"Unsupported optimizer: {self.config.optimizer}") # Clear scheduler for certain optimizer choices if self.config.optimizer in ["adam", "adamax", "radam"]: if self.config.have_lr_scheduler: self.config.have_lr_scheduler = False if state_dict and "optimizer" in state_dict: self.optimizer.load_state_dict(state_dict["optimizer"]) if self.config.fp16: try: global amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize( self.network, self.optimizer, opt_level=self.config.fp16_opt_level) self.network = model self.optimizer = optimizer if self.config.have_lr_scheduler: if self.config.scheduler_type == "rop": self.scheduler = ReduceLROnPlateau(self.optimizer, mode="max", factor=self.config.lr_gamma, patience=3) elif self.config.scheduler_type == "exp": self.scheduler = ExponentialLR(self.optimizer, gamma=self.config.lr_gamma or 0.95) else: milestones = [ int(step) for step in ( self.config.multi_step_lr or "10,20,30").split(",") ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=self.config.lr_gamma) else: self.scheduler = None def _setup_lossmap(self): self.task_loss_criterion = [] for idx, cs in enumerate(self.config.loss_types): assert cs is not None, "Loss type must be defined." lc = LOSS_REGISTRY[cs](name=f"Loss func of task {idx}: {cs}") self.task_loss_criterion.append(lc) def _setup_kd_lossmap(self): loss_types = self.config.kd_loss_types self.kd_task_loss_criterion = [] if config.mkd_opt > 0: for idx, cs in enumerate(loss_types): assert cs, "Loss type must be defined." 
lc = LOSS_REGISTRY[cs]( name="Loss func of task {}: {}".format(idx, cs)) self.kd_task_loss_criterion.append(lc) def _to_cuda(self, tensor): # Set tensor to gpu (non-blocking) if a PyTorch tensor if tensor is None: return tensor if isinstance(tensor, list) or isinstance(tensor, tuple): y = [ e.cuda(device=self.config.cuda_device, non_blocking=True) for e in tensor ] for t in y: t.requires_grad = False else: y = tensor.cuda(device=self.config.cuda_device, non_blocking=True) y.requires_grad = False return y def train(self): if self.para_swapped: self.para_swapped = False def update(self, batch_meta, batch_data): self.network.train() target = batch_data[batch_meta["label"]] soft_labels = None task_type = batch_meta["task_type"] target = self._to_cuda(target) if self.config.cuda else target task_id = batch_meta["task_id"] inputs = batch_data[:batch_meta["input_len"]] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) weight = None if self.config.weighted_on: if self.config.cuda: weight = batch_data[batch_meta["factor"]].cuda( device=self.config.cuda_device, non_blocking=True) else: weight = batch_data[batch_meta["factor"]] logits = self.mnetwork(*inputs) # compute loss loss = 0 if self.task_loss_criterion[task_id] and (target is not None): loss = self.task_loss_criterion[task_id](logits, target, weight, ignore_index=-1) # compute kd loss if self.config.mkd_opt > 0 and ("soft_label" in batch_meta): soft_labels = batch_meta["soft_label"] soft_labels = (self._to_cuda(soft_labels) if self.config.cuda else soft_labels) kd_lc = self.kd_task_loss_criterion[task_id] kd_loss = (kd_lc(logits, soft_labels, weight, ignore_index=-1) if kd_lc else 0) loss = loss + kd_loss self.train_loss.update(loss.item(), batch_data[batch_meta["token_id"]].size(0)) # scale loss loss = loss / (self.config.grad_accumulation_step or 1) if self.config.fp16: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.local_updates += 1 if self.local_updates % self.config.grad_accumulation_step == 0: if self.config.global_grad_clipping > 0: if self.config.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.config.global_grad_clipping, ) else: torch.nn.utils.clip_grad_norm_( self.network.parameters(), self.config.global_grad_clipping) self.updates += 1 # reset number of the grad accumulation self.optimizer.step() self.optimizer.zero_grad() def eval_mode( self, data: DataLoader, metric_meta, use_cuda=True, with_label=True, label_mapper=None, task_type=TaskType.Classification, ): if use_cuda: self.cuda() predictions = [] golds = [] scores = [] ids = [] metrics = {} for idx, (batch_info, batch_data) in enumerate(data): if idx % 100 == 0: logger.info(f"predicting {idx}") batch_info, batch_data = MTDNNCollater.patch_data( use_cuda, batch_info, batch_data) score, pred, gold = self._predict_batch(batch_info, batch_data) predictions.extend(pred) golds.extend(gold) scores.extend(score) ids.extend(batch_info["uids"]) if task_type == TaskType.Span: golds = merge_answers(ids, golds) predictions, scores = select_answers(ids, predictions, scores) if with_label: metrics = calc_metrics(metric_meta, golds, predictions, scores, label_mapper) return metrics, predictions, scores, golds, ids def _predict_batch(self, batch_meta, batch_data): self.network.eval() task_id = batch_meta["task_id"] task_type = batch_meta["task_type"] inputs = batch_data[:batch_meta["input_len"]] if len(inputs) == 3: inputs.append(None) inputs.append(None) 
inputs.append(task_id) score = self.mnetwork(*inputs) if task_type == TaskType.Ranking: score = score.contiguous().view(-1, batch_meta["pairwise_size"]) assert task_type == TaskType.Ranking score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta["true_label"] elif task_type == TaskType.SequenceLabeling: mask = batch_data[batch_meta["mask"]] score = score.contiguous() score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).reshape(mask.size()).tolist() valied_lenght = mask.sum(1).tolist() final_predict = [] for idx, p in enumerate(predict): final_predict.append(p[:valied_lenght[idx]]) score = score.reshape(-1).tolist() return score, final_predict, batch_meta["label"] elif task_type == TaskType.Span: start, end = score predictions = [] if self.config.encoder_type == EncoderModelType.BERT: scores, predictions = extract_answer( batch_meta, batch_data, start, end, self.config.get("max_answer_len", 5), ) return scores, predictions, batch_meta["answer"] else: if task_type == TaskType.Classification: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta["label"] def fit(self, epochs=0): """ Fit model to training datasets """ epochs = epochs or self.config.epochs logger.info(f"Total number of params: {self.total_param}") for epoch in range(epochs): logger.info(f"At epoch {epoch}") logger.info( f"Amount of data to go over: {len(self.multitask_train_dataloader)}" ) start = datetime.now() # Create batches and train for idx, (batch_meta, batch_data) in enumerate( self.multitask_train_dataloader): batch_meta, batch_data = MTDNNCollater.patch_data( self.config.cuda, batch_meta, batch_data) task_id = batch_meta["task_id"] self.update(batch_meta, batch_data) if (self.local_updates == 1 or (self.local_updates) % (self.config.log_per_updates * self.config.grad_accumulation_step) == 0): time_left = str((datetime.now() - start) / (idx + 1) * (len(self.multitask_train_dataloader) - idx - 1)).split(".")[0] logger.info( "Task - [{0:2}] Updates - [{1:6}] Training Loss - [{2:.5f}] Time Remaining - [{3}]" .format( task_id, self.updates, self.train_loss.avg, time_left, )) if self.config.use_tensor_board: self.tensor_board.add_scalar( "train/loss", self.train_loss.avg, global_step=self.updates, ) if self.config.save_per_updates_on and ( (self.local_updates) % (self.config.save_per_updates * self.config.grad_accumulation_step) == 0): model_file = os.path.join( self.output_dir, "model_{}_{}.pt".format(epoch, self.updates), ) logger.info(f"Saving mt-dnn model to {model_file}") self.save(model_file) # TODO: Alternatively, we need to refactor save function # and move into prediction # Saving each checkpoint after model training model_file = os.path.join(self.output_dir, "model_{}.pt".format(epoch)) logger.info(f"Saving mt-dnn model to {model_file}") self.save(model_file) def predict(self, trained_model_chckpt: str = None, saved_epoch_idx: int = 0): """ Inference of model on test datasets """ # Load a trained checkpoint if a valid model checkpoint if trained_model_chckpt and os.path.exists(trained_model_chckpt): logger.info(f"Running predictions using: {trained_model_chckpt}") 
self.load(trained_model_chckpt) # Create batches and train start = datetime.now() for idx, dataset in enumerate(self.test_datasets_list): prefix = dataset.split("_")[0] label_dict = self.task_defs.global_map.get(prefix, None) dev_data: DataLoader = self.dev_dataloaders_list[idx] if dev_data is not None: with torch.no_grad(): ( dev_metrics, dev_predictions, scores, golds, dev_ids, ) = self.eval_mode( dev_data, metric_meta=self.task_defs.metric_meta_map[prefix], use_cuda=self.config.cuda, label_mapper=label_dict, task_type=self.task_defs.task_type_map[prefix], ) for key, val in dev_metrics.items(): if self.config.use_tensor_board: self.tensor_board.add_scalar( f"dev/{dataset}/{key}", val, global_step=saved_epoch_idx) if isinstance(val, str): logger.info( f"Task {dataset} -- epoch {saved_epoch_idx} -- Dev {key}:\n {val}" ) else: logger.info( f"Task {dataset} -- epoch {saved_epoch_idx} -- Dev {key}: {val:.3f}" ) score_file = os.path.join( self.output_dir, f"{dataset}_dev_scores_{saved_epoch_idx}.json") results = { "metrics": dev_metrics, "predictions": dev_predictions, "uids": dev_ids, "scores": scores, } # Save results to file MTDNNCommonUtils.dump(score_file, results) if self.config.use_glue_format: official_score_file = os.path.join( self.output_dir, "{}_dev_scores_{}.tsv".format(dataset, saved_epoch_idx), ) submit(official_score_file, results, label_dict) # test eval test_data: DataLoader = self.test_dataloaders_list[idx] if test_data is not None: with torch.no_grad(): ( test_metrics, test_predictions, scores, golds, test_ids, ) = self.eval_mode( test_data, metric_meta=self.task_defs.metric_meta_map[prefix], use_cuda=self.config.cuda, with_label=False, label_mapper=label_dict, task_type=self.task_defs.task_type_map[prefix], ) score_file = os.path.join( self.output_dir, f"{dataset}_test_scores_{saved_epoch_idx}.json") results = { "metrics": test_metrics, "predictions": test_predictions, "uids": test_ids, "scores": scores, } MTDNNCommonUtils.dump(score_file, results) if self.config.use_glue_format: official_score_file = os.path.join( self.output_dir, f"{dataset}_test_scores_{saved_epoch_idx}.tsv") submit(official_score_file, results, label_dict) logger.info("[new test scores saved.]") # Close tensorboard connection if opened self.close_connections() def close_connections(self): # Close tensor board connection if self.config.use_tensor_board: self.tensor_board.close() def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) params = { "state": network_state, "optimizer": self.optimizer.state_dict(), "config": self.config, } torch.save(params, filename) logger.info("model saved to {}".format(filename)) def load(self, checkpoint): model_state_dict = torch.load(checkpoint) self.network.load_state_dict(model_state_dict["state"], strict=False) self.optimizer.load_state_dict(model_state_dict["optimizer"]) self.config = model_state_dict["config"] def cuda(self): self.network.cuda(device=self.config.cuda_device) def supported_init_checkpoints(self): """List of allowed check points """ return [ "bert-base-uncased", "bert-base-cased", "bert-large-uncased", "mtdnn-base-uncased", "mtdnn-large-uncased", "roberta.base", "roberta.large", ] def update_config_with_training_opts( self, decoder_opts, task_types, dropout_list, 
loss_types, kd_loss_types, tasks_nclass_list, ): # Update configurations with options obtained from preprocessing training data setattr(self.config, "decoder_opts", decoder_opts) setattr(self.config, "task_types", task_types) setattr(self.config, "tasks_dropout_p", dropout_list) setattr(self.config, "loss_types", loss_types) setattr(self.config, "kd_loss_types", kd_loss_types) setattr(self.config, "tasks_nclass_list", tasks_nclass_list)
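# _get_param_groups above follows the usual BERT recipe of exempting biases and LayerNorm parameters from weight
# decay via a substring match on parameter names. A self-contained sketch of the same grouping on a toy module,
# fed to torch's built-in AdamW purely for illustration (the MT-DNN code passes the groups to its own
# Adamax/RAdam/Adam wrappers):
import torch
import torch.nn as nn

class TinyEncoderBlock(nn.Module):
    # Attribute names deliberately mirror BERT's ("LayerNorm", biases) so the filter below behaves the same way.
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)
        self.classifier = nn.Linear(8, 2)

model = TinyEncoderBlock()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=5e-5)
print([len(g['params']) for g in optimizer.param_groups])   # [2, 4]: weights decayed, biases/LayerNorm exempt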
def main(): parser = argparse.ArgumentParser() arg = parser.add_argument arg('mode', choices=['train', 'validate', 'predict', 'train_all']) arg('run_root') arg('--model', default='mybert') arg('--pretrained', type=int, default=0) arg('--batch-size', type=int, default=32) arg('--step', type=int, default=1) arg('--workers', type=int, default=2) arg('--lr', type=float, default=0.0002) arg('--patience', type=int, default=4) arg('--clean', action='store_true') arg('--n-epochs', type=int, default=1) arg('--kloss', type=float, default=1.0) arg('--loss_fn', default='loss1') arg('--fold_name', default='/folds_binary_weights_kernal.pkl') arg('--limit', type=int) arg('--fold', type=int, default=0) arg('--multi-gpu', type=int, default=0) arg('--lr_layerdecay', type=float, default=0.95) arg('--warmup', type=float, default=0.05) arg('--split_point', type=float, default=0.3) arg('--bsample', type=bool, default=True) args = parser.parse_args() set_seed() BERT_PRETRAIN_PATH = '../input/torch-bert-weights/%s/' % (args.model) run_root = Path('../experiments/' + args.run_root) DATA_ROOT = Path( '../input/jigsaw-unintended-bias-in-toxicity-classification') folds = pd.read_pickle(DATA_ROOT / 'folds.pkl') identity_columns = [ 'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness' ] weights = np.ones((len(folds), )) / 4 # # Subgroup weights += (folds[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int) / 4 # # Background Positive, Subgroup Negative weights += (((folds['target'].values >= 0.5).astype(bool).astype(np.int) + (folds[identity_columns].fillna(0).values < 0.5).sum( axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype(np.int) / 4 # # Background Negative, Subgroup Positive weights += (((folds['target'].values < 0.5).astype(bool).astype(np.int) + (folds[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype(np.int) / 4 folds['weights'] = weights print(folds['weights'].mean()) if args.mode == "train_all": train_fold = folds else: train_fold = folds[folds['fold'] != args.fold] valid_fold = folds[folds['fold'] == args.fold] valid_fold = valid_fold.sort_values(by=["len"]) if args.limit: train_fold = train_fold[:args.limit] if args.mode != "train_all": valid_fold = valid_fold[:args.limit * 3] if args.mode == "train_all": valid_df = None else: valid_df = valid_fold[identity_columns + ["target"]] loss_weight = 1 / folds['weights'].mean() * args.kloss if args.loss_fn == "loss1": loss_fn = custom_loss elif args.loss_fn == "loss2": loss_fn = custom_loss2 criterion = partial(loss_fn, loss_weight=loss_weight) if args.mode == 'train' or args.mode == "train_all": if run_root.exists() and args.clean: shutil.rmtree(run_root) run_root.mkdir(exist_ok=True, parents=True) (run_root / 'params.json').write_text( json.dumps(vars(args), indent=4, sort_keys=True)) training_set = TrainDataset(train_fold['comment_text'].tolist(), lens=train_fold['len'].tolist(), target=train_fold[[ 'binary_target', 'weights', 'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat' ]].values.tolist(), identity_df=train_fold[identity_columns], weights=train_fold['weights'].tolist(), model=args.model, split_point=args.split_point) if args.bsample: bbsampler = BucketBatchSampler(training_set, batch_size=args.batch_size, drop_last=True, sort_key=lambda x: x[1], biggest_batches_first=None, bucket_size_multiplier=100, shuffle=True) batchsize = 1 shuffle = False else: 
bbsampler = None batchsize = args.batch_size shuffle = True training_loader = DataLoader(training_set, batch_sampler=bbsampler, collate_fn=collate_fn, num_workers=args.workers, batch_size=batchsize, shuffle=shuffle) if args.mode == "train": valid_set = TrainDataset( valid_fold['comment_text'].tolist(), lens=valid_fold['len'].tolist(), target=valid_fold['binary_target'].values.tolist(), identity_df=valid_fold[identity_columns], weights=valid_fold['weights'].tolist(), model=args.model, split_point=args.split_point) valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn, num_workers=args.workers) else: valid_loader = None # model = BertForSequenceClassification.from_pretrained(BERT_PRETRAIN_PATH,cache_dir=None,num_labels=1) model = BertModel(BERT_PRETRAIN_PATH) model.cuda() if args.model in [ "bert-base-uncased", "bert-base-cased", "mybert", "gpt2", 'mybert-base-cased', 'mybert-base-uncased' ]: NUM_LAYERS = 12 elif args.model in [ "bert-large-uncased", "bert-large-cased", "mybertlarge", "wmm", "mybertlargecased", "mybert-large-uncased", 'mybert-wwm-uncased' ]: NUM_LAYERS = 24 else: raise ValueError('%s is not a valid model' % args.model) optimizer_grouped_parameters = [{ 'params': model.bert.bert.embeddings.parameters(), 'lr': args.lr * (args.lr_layerdecay**NUM_LAYERS) }, { 'params': model.main_head.parameters(), 'lr': args.lr }, { 'params': model.aux_head.parameters(), 'lr': args.lr }, { 'params': model.bert.bert.pooler.parameters(), 'lr': args.lr }] for layer in range(NUM_LAYERS): optimizer_grouped_parameters.append( { 'params': model.bert.bert.encoder.layer.__getattr__( '%d' % (NUM_LAYERS - 1 - layer)).parameters(), 'lr': args.lr * (args.lr_layerdecay**layer) }, ) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.lr, warmup=args.warmup, t_total=len(training_loader) // args.step) scheduler = ReduceLROnPlateau(optimizer, patience=0, factor=0.1, verbose=True, mode='max', min_lr=1e-7) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", verbosity=0) optimizer.zero_grad() if args.multi_gpu == 1: model = nn.DataParallel(model) train(args, model, optimizer, scheduler, criterion, train_loader=training_loader, valid_df=valid_df, valid_loader=valid_loader, epoch_length=len(training_set)) elif args.mode == 'validate': valid_set = TrainDataset(valid_fold['comment_text'].tolist(), lens=valid_fold['len'].tolist(), target=valid_fold[['binary_target' ]].values.tolist(), identity_df=valid_fold[identity_columns], weights=valid_fold['weights'].tolist(), model=args.model, split_point=args.split_point) valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn, num_workers=args.workers) model = BertModel(BERT_PRETRAIN_PATH) load_model(model, run_root / ('best-model-%d.pt' % args.fold), multi2single=False) model.cuda() optimizer = BertAdam(model.parameters(), lr=1e-5, warmup=0.95) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", verbosity=0) if args.multi_gpu == 1: model = nn.DataParallel(model) validation(model, criterion, valid_df, valid_loader, args, save_result=True, progress=True)
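# The optimizer_grouped_parameters built in main() give every encoder layer its own learning rate, decayed
# geometrically from the top layer down (lr * lr_layerdecay ** depth), with the embeddings getting the smallest lr.
# A toy sketch of that layer-wise decay over a small stack of linear layers (sizes and rates are illustrative):
import torch
import torch.nn as nn

NUM_LAYERS, base_lr, layerdecay = 4, 2e-4, 0.95
layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(NUM_LAYERS)])
head = nn.Linear(8, 2)

param_groups = [{'params': head.parameters(), 'lr': base_lr}]
# depth 0 is the top layer (closest to the head) and keeps the base lr; lower layers shrink by layerdecay each.
for depth, layer_idx in enumerate(range(NUM_LAYERS - 1, -1, -1)):
    param_groups.append({'params': layers[layer_idx].parameters(),
                         'lr': base_lr * (layerdecay ** depth)})

optimizer = torch.optim.Adam(param_groups, lr=base_lr)
print([round(g['lr'], 6) for g in optimizer.param_groups])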
def fine_tune(corpus_name, train_corpus, dev_corpus, column_names):
    device = get_cuda_device()
    train_corpus_df = parse_csv(train_corpus, column_names)
    input_ids, labels, attention_masks = RENAME_ME(train_corpus_df, corpus_name, True, MAX_LEN)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        input_ids, labels, random_state=RANDOM_STATE, test_size=TEST_SIZE)
    train_masks, test_masks, _, _ = train_test_split(
        attention_masks, input_ids, random_state=RANDOM_STATE, test_size=TEST_SIZE)
    train_data_loader = get_data_loader(train_inputs, train_labels, train_masks, BATCH_SIZE)
    test_data_loader = get_data_loader(test_inputs, test_labels, test_masks, BATCH_SIZE)
    model = BertForSequenceClassification.from_pretrained(corpus_name, num_labels=2)
    model.cuda()
    # Exclude biases and LayerNorm-style parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)
    for _ in range(EPOCHS):
        model.train()
        for step, batch in enumerate(train_data_loader):
            input_ids, mask, labels = tuple(t.to(device) for t in batch)
            optimizer.zero_grad()
            loss = model(input_ids, token_type_ids=None, attention_mask=mask, labels=labels)
            loss.backward()
            optimizer.step()
        model.eval()
        for batch in test_data_loader:
            input_ids, mask, labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                logits = model(input_ids, token_type_ids=None, attention_mask=mask)
    # Evaluate on the held-out dev corpus.
    dev_corpus_df = parse_csv(dev_corpus, column_names)
    dev_input_ids, dev_labels, dev_attentions_masks = RENAME_ME(
        dev_corpus_df, corpus_name, True, MAX_LEN)
    dev_data_loader = get_data_loader(dev_input_ids, dev_labels, dev_attentions_masks, BATCH_SIZE)
    model.eval()
    predictions = []
    true_labels = []
    for batch in dev_data_loader:
        input_ids, mask, labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(input_ids, token_type_ids=None, attention_mask=mask)
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)
    # Per-batch and aggregate Matthews correlation coefficient.
    matthews_set = []
    for true_label, prediction in zip(true_labels, predictions):
        matthews = matthews_corrcoef(true_label,
                                     np.argmax(prediction, axis=1).flatten())
        matthews_set.append(matthews)
    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    flat_true_labels = [item for sublist in true_labels for item in sublist]
    return matthews_corrcoef(flat_true_labels, flat_predictions)
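# Several loaders above build attention masks by padding the token-id sequences to a fixed length and marking every
# non-zero position with 1. A dependency-free sketch of that padding/masking step; the token ids below are made up
# (the real code gets them from the BERT tokenizer):
max_len = 8
token_id_batches = [[101, 7592, 2088, 102],
                    [101, 2023, 2003, 1037, 2936, 6251, 102]]

input_ids, attention_masks = [], []
for seq in token_id_batches:
    seq = seq[:max_len]                                     # truncate ("post")
    padded = seq + [0] * (max_len - len(seq))               # pad with zeros ("post")
    input_ids.append(padded)
    attention_masks.append([float(i > 0) for i in padded])  # 1.0 on real tokens, 0.0 on padding

print(input_ids[1])
print(attention_masks[1])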
class MTDNNModel(object): def __init__(self, opt, state_dict=None, num_train_step=-1, use_parse=False, embedding_matrix=None, token2idx=None, stx_parse_dim=None, unked_words=None, use_generic_features=False, num_generic_features=None, use_domain_features=False, num_domain_features=None, feature_dim=None): self.config = opt self.updates = state_dict[ 'updates'] if state_dict and 'updates' in state_dict else 0 self.train_loss = AverageMeter() self.network = SANBertNetwork( opt, use_parse=use_parse, embedding_matrix=embedding_matrix, token2idx=token2idx, stx_parse_dim=stx_parse_dim, unked_words=unked_words, use_generic_features=use_generic_features, num_generic_features=num_generic_features, use_domain_features=use_domain_features, num_domain_features=num_domain_features, feature_dim=feature_dim) if state_dict: new_state = set(self.network.state_dict().keys()) for k in list(state_dict['state'].keys()): if k not in new_state: del state_dict['state'][k] for k, v in list(self.network.state_dict().items()): if k not in state_dict['state']: state_dict['state'][k] = v self.network.load_state_dict(state_dict['state']) self.mnetwork = nn.DataParallel( self.network) if opt['multi_gpu_on'] else self.network self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) no_decay = [ 'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight' ] optimizer_parameters = [{ 'params': [ p for n, p in self.network.named_parameters() if n not in no_decay ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in self.network.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] # note that adamax are modified based on the BERT code if opt['optimizer'] == 'sgd': self.optimizer = optim.sgd(optimizer_parameters, opt['learning_rate'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adamax': self.optimizer = Adamax(optimizer_parameters, opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False elif opt['optimizer'] == 'adadelta': self.optimizer = optim.Adadelta(optimizer_parameters, opt['learning_rate'], rho=0.95) elif opt['optimizer'] == 'adam': self.optimizer = Adam(optimizer_parameters, lr=opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) if state_dict and 'optimizer' in state_dict: self.optimizer.load_state_dict(state_dict['optimizer']) if opt.get('have_lr_scheduler', False): if opt.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=opt['lr_gamma'], patience=3) elif opt.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentialLR(self.optimizer, gamma=opt.get('lr_gamma', 0.95)) else: milestones = [ int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',') ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=opt.get('lr_gamma')) else: self.scheduler = None self.ema = None if opt['ema_opt'] > 0: self.ema = EMA(self.config['ema_gamma'], self.network) self.para_swapped = False def setup_ema(self): if self.config['ema_opt']: self.ema.setup() def update_ema(self): if self.config['ema_opt']: self.ema.update() def eval(self): if self.config['ema_opt']: self.ema.swap_parameters() 
self.para_swapped = True def train(self): if self.para_swapped: self.ema.swap_parameters() self.para_swapped = False def _value_for(self, key, batch_data, batch_meta): return batch_data[batch_meta[key]] if key in batch_meta else None def update(self, batch_meta, batch_data, bin_parse_as=None, bin_parse_bs=None, parse_as_mask=None, parse_bs_mask=None, generic_features=None, domain_features=None): self.network.train() labels = batch_data[batch_meta['label']] if batch_meta['pairwise']: labels = labels.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0] if self.config['cuda']: y = Variable(labels.cuda(non_blocking=True), requires_grad=False) else: y = Variable(labels, requires_grad=False) task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) logits = self.mnetwork( *inputs, bin_parse_as=self._value_for('parse_ids_a', batch_data, batch_meta), bin_parse_bs=self._value_for('parse_ids_b', batch_data, batch_meta), parse_as_mask=self._value_for('parse_masks_a', batch_data, batch_meta), parse_bs_mask=self._value_for('parse_masks_b', batch_data, batch_meta), generic_features=self._value_for('generic_features', batch_data, batch_meta), domain_features=self._value_for('domain_features', batch_data, batch_meta)) if batch_meta['pairwise']: logits = logits.view(-1, batch_meta['pairwise_size']) if self.config.get('weighted_on', False): if self.config['cuda']: weight = Variable( batch_data[batch_meta['factor']].cuda(non_blocking=True)) else: weight = Variable(batch_data[batch_meta['factor']]) if task_type > 0: loss = torch.mean( F.mse_loss(logits.squeeze(), y, reduce=False) * weight) else: loss = torch.mean( F.cross_entropy(logits, y, reduce=False) * weight) else: if task_type > 0: loss = F.mse_loss(logits.squeeze(), y) else: loss = F.cross_entropy(logits, y) self.train_loss.update(loss.item(), logits.size(0)) self.optimizer.zero_grad() loss.backward() if self.config['global_grad_clipping'] > 0: torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.config['global_grad_clipping']) self.optimizer.step() self.updates += 1 self.update_ema() def predict(self, batch_meta, batch_data, bin_parse_as=None, bin_parse_bs=None, parse_as_mask=None, parse_bs_mask=None, generic_features=None, domain_features=None): self.network.eval() task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) with torch.no_grad(): score = self.network( *inputs, bin_parse_as=self._value_for('parse_ids_a', batch_data, batch_meta), bin_parse_bs=self._value_for('parse_ids_b', batch_data, batch_meta), parse_as_mask=self._value_for('parse_masks_a', batch_data, batch_meta), parse_bs_mask=self._value_for('parse_masks_b', batch_data, batch_meta), generic_features=self._value_for('generic_features', batch_data, batch_meta), domain_features=self._value_for('domain_features', batch_data, batch_meta)) if batch_meta['pairwise']: score = score.contiguous().view(-1, batch_meta['pairwise_size']) if task_type < 1: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['true_label'] else: if task_type < 
1: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['label'] def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) ema_state = dict([ (k, v.cpu()) for k, v in self.ema.model.state_dict().items() ]) if self.ema is not None else dict() params = { 'state': network_state, #'optimizer': self.optimizer.state_dict(), 'ema': ema_state, 'config': self.config, } torch.save(params, filename) logger.info('model saved to {}'.format(filename)) def cuda(self): self.network.cuda() if self.config['ema_opt']: self.ema.cuda()
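# The EMA helper used above (setup / update / swap_parameters) keeps an exponential moving average of the network
# weights and swaps the averaged copy in for evaluation, then swaps the live weights back for training. A minimal
# sketch of that pattern; this is an illustration, not the EMA class this codebase actually imports:
import copy
import torch
import torch.nn as nn

class SimpleEMA:
    """Shadow weights p_ema = gamma * p_ema + (1 - gamma) * p, swapped in for eval."""
    def __init__(self, model, gamma=0.999):
        self.gamma = gamma
        self.model = model
        self.shadow = {k: v.detach().clone() for k, v in model.state_dict().items()}
        self.backup = None

    def update(self):
        for k, v in self.model.state_dict().items():
            self.shadow[k].mul_(self.gamma).add_(v.detach(), alpha=1 - self.gamma)

    def swap_parameters(self):
        # First call loads the averaged weights; the next call restores the training weights.
        if self.backup is None:
            self.backup = copy.deepcopy(self.model.state_dict())
            self.model.load_state_dict(self.shadow)
        else:
            self.model.load_state_dict(self.backup)
            self.backup = None

model = nn.Linear(4, 2)
ema = SimpleEMA(model)
for _ in range(3):
    with torch.no_grad():
        model.weight.add_(0.01)   # stand-in for an optimizer step
    ema.update()
ema.swap_parameters()             # evaluate with the averaged weights
ema.swap_parameters()             # back to the live training weights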
def train(args): label_name = ['fake', 'real'] device = torch.device("cuda:0" if args['CUDA'] == 'gpu' else "cpu") prefix = args['MODEL'] + '_' + args['BERT_CONFIG'] bert_size = args['BERT_CONFIG'].split('-')[1] start_time = time.time() print('Importing data...', file=sys.stderr) df_train = pd.read_csv(args['--train'], index_col=0) df_val = pd.read_csv(args['--dev'], index_col=0) train_label = dict(df_train.information_label.value_counts()) print("Train label", train_label) label_max = float(max(train_label.values())) print("Label max", label_max) train_label_weight = torch.tensor( [label_max / train_label[i] for i in range(len(train_label))], device=device) print(train_label_weight) print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr) print('-' * 80, file=sys.stderr) start_time = time.time() print('Set up model...', file=sys.stderr) if args['MODEL'] == 'cnn': model = CustomBertConvModel(args['BERT_CONFIG'], device, float(args['--dropout']), len(label_name), out_channel=int(args['--out-channel'])) optimizer = BertAdam([{ 'params': model.bert.parameters() }, { 'params': model.conv.parameters(), 'lr': float(args['--lr']) }, { 'params': model.hidden_to_softmax.parameters(), 'lr': float(args['--lr']) }], lr=float(args['--lr-bert']), max_grad_norm=float(args['--clip-grad'])) elif args['MODEL'] == 'lstm': model = CustomBertLSTMModel(args['BERT_CONFIG'], device, float(args['--dropout']), len(label_name), lstm_hidden_size=int( args['--hidden-size'])) optimizer = BertAdam([{ 'params': model.bert.parameters() }, { 'params': model.lstm.parameters(), 'lr': float(args['--lr']) }, { 'params': model.hidden_to_softmax.parameters(), 'lr': float(args['--lr']) }], lr=float(args['--lr-bert']), max_grad_norm=float(args['--clip-grad'])) else: print('please input valid model') exit(0) model = model.to(device) print('Use device: %s' % device, file=sys.stderr) print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr) print('-' * 80, file=sys.stderr) model.train() cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean') torch.save(cn_loss, 'loss_func') # for later testing train_batch_size = int(args['--batch-size']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = prefix + '_model.bin' num_trial = 0 train_iter = patience = cum_loss = report_loss = 0 cum_examples = report_examples = epoch = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('Begin Maximum Likelihood training...') while True: epoch += 1 for sents, targets in batch_iter(df_train, batch_size=train_batch_size, shuffle=True, bert=bert_size): # for each epoch train_iter += 1 # increase training iteration # set gradients to zero before starting to do backpropagation. # Pytorch accummulates the gradients on subsequnt backward passes. optimizer.zero_grad() batch_size = len(sents) pre_softmax = model(sents).double() loss = cn_loss( pre_softmax, torch.tensor(targets, dtype=torch.long, device=device)) # The gradients are "stored" by the tensors themselves once you call backwards # on the loss. loss.backward() ''' After computing the gradients for all tensors in the model, calling optimizer.step() makes the optimizer iterate over all parameters (tensors) it is supposed to update and use their internally stored grad to update their values. 
''' optimizer.step() # loss.item() contains the loss for the mini-batch, but divided by the batch_size # that's why multiply by the batch_size batch_losses_val = loss.item() * batch_size report_loss += batch_losses_val cum_loss += batch_losses_val report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, ' 'cum. examples %d, speed %.2f examples/sec, ' 'time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, cum_examples, report_examples / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_examples = 0. # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, cum_examples), file=sys.stderr) cum_loss = cum_examples = 0 print('begin validation....', file=sys.stderr) validation_loss = validation( model, df_val, bert_size, cn_loss, device) # dev batch size can be a bit larger print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr) is_better = len( hist_valid_scores ) == 0 or validation_loss < min(hist_valid_scores) hist_valid_scores.append(validation_loss) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint print( 'load previously best model and decay learning rate to %f%%' % (float(args['--lr-decay']) * 100), file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] *= float(args['--lr-decay']) # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
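# The validation branch above combines three counters: patience (validations without a new best), trials (times
# patience was exhausted), and an lr decay plus reload of the best checkpoint at every trial. A compact sketch of
# that control flow with synthetic validation losses; checkpoint saving and reloading are elided:
patience_limit, max_trials, lr_decay = 2, 3, 0.5
lr, best, patience, trials = 1e-3, float('inf'), 0, 0

for step, val_loss in enumerate([0.9, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77]):
    if val_loss < best:
        best, patience = val_loss, 0      # new best: save the checkpoint here
    else:
        patience += 1
        if patience == patience_limit:    # patience exhausted: count one trial
            trials += 1
            if trials == max_trials:
                print('early stop at validation', step)
                break
            lr *= lr_decay                # decay lr and (in the real code) reload the best checkpoint
            patience = 0
print('best %.2f, final lr %g, trials %d' % (best, lr, trials))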
def train_topic(st, tt): device = torch.device("cuda:1") # with open('train_sem_mask.pickle', 'rb') as f: # train_dataeval_mask_set = pickle.load(f) # with open('test_sem_mask.pickle', 'rb') as f: # test_dataeval_mask_set = pickle.load(f) # with open('train_sem.pickle', 'rb') as f: # train_dataeval_set = pickle.load(f) # with open('test_sem.pickle', 'rb') as f: # test_dataeval_set = pickle.load(f) # with open('framenet.pickle', 'rb') as f: # test_framenet_set = pickle.load(f) # with open('framenet_mask.pickle', 'rb') as f: # test_framenet_mask_set = pickle.load(f) # with open('data_seen.pickle', 'rb') as f: # data = pickle.load(f) # train_set, test_set = data['train'], data['test'] # with open('data_seen_mask.pickle', 'rb') as f: # data = pickle.load(f) # train_set_mask, test_set_mask = data['train'], data['test'] ### Reading data... with open('data.pickle', 'rb') as f: data = pickle.load(f) # train_set, test_set = split_train_test(data) train_set = get_topic(data, st) test_set = get_topic(data, tt) with open('data_mask.pickle', 'rb') as f: data_mask = pickle.load(f) #train_set_mask, test_set_mask = split_train_test(data) train_set_mask = get_topic(data_mask, st) test_set_mask = get_topic(data_mask, tt) test_set, test_set_mask = test_set, test_set_mask train_pair = list(zip(train_set, train_set_mask)) train_pair = negative_sampling(train_pair, 0.8) train_set, train_set_mask = [d[0] for d in train_pair ], [d[1] for d in train_pair] ### test_dataset = Dataset(10, test_set) test_dataset_mask = Dataset(10, test_set_mask) test_dataset_batch = [ batch for batch in test_dataset.reader(device, False) ] test_dataset_mask_batch = [ batch for batch in test_dataset_mask.reader(device, False) ] test_dataset_mix = list(zip(test_dataset_batch, test_dataset_mask_batch)) ### train_dataset = Dataset(20, train_set) train_dataset_mask = Dataset(20, train_set_mask) train_dataset_batch = [ batch for batch in train_dataset.reader(device, False) ] train_dataset_mask_batch = [ batch for batch in train_dataset_mask.reader(device, False) ] train_dataset_mix = list(zip(train_dataset_batch, train_dataset_mask_batch)) model = BertCausalModel(3).to(device) model_mask = BertCausalModel(3).to(device) learning_rate = 1e-5 optimizer = BertAdam(model.parameters(), lr=learning_rate) optimizer_mask = BertAdam(model_mask.parameters(), lr=learning_rate) loss_fn = torch.nn.CrossEntropyLoss(reduction='sum') for _ in range(0, 20): idx = 0 for batch, batch_mask in tqdm(train_dataset_mix, mininterval=2, total=len(train_dataset_mix), file=sys.stdout, ncols=80): idx += 1 model.train() model_mask.train() sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch sentences_s_mask = batch_mask[0] opt = model.forward_logits(sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask) opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask) opt_mix = torch.cat([opt, opt_mask], dim=-1) logits = model.additional_fc(opt_mix) loss = loss_fn(logits, data_y) optimizer.zero_grad() optimizer_mask.zero_grad() loss.backward() optimizer.step() optimizer_mask.step() model.eval() model_mask.eval() with torch.no_grad(): predicted_all = [] gold_all = [] for batch, batch_mask in test_dataset_mix: sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch sentences_s_mask = batch_mask[0] opt = model.forward_logits(sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, 
event2, event2_mask) opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask) opt_mix = torch.cat([opt, opt_mask], dim=-1) logits = model.additional_fc(opt_mix) predicted = torch.argmax(logits, -1) predicted = list(predicted.cpu().numpy()) predicted_all += predicted gold = list(data_y.cpu().numpy()) gold_all += gold p, r, f = compute_f1(gold_all, predicted_all) print('precision %.4f, recall %.4f, f1 %.4f' % (p, r, f))
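# Sketch of the two-branch setup trained above: one encoder sees the original sentence, a
# second sees the masked variant, their features are concatenated and scored by one extra
# classification layer, and both encoders are stepped with their own optimizers on the shared
# loss. TinyEncoder and the toy tensors are stand-ins for BertCausalModel.forward_logits and
# the real batches; additional_fc is modelled here as a separate Linear head.
import torch
import torch.nn as nn

class TinyEncoder(nn.Module):
    def __init__(self, dim=16):
        super().__init__()
        self.proj = nn.Linear(32, dim)

    def forward_logits(self, x):
        return torch.relu(self.proj(x))

enc, enc_mask = TinyEncoder(), TinyEncoder()
head = nn.Linear(32, 3)                                  # plays the role of additional_fc
opt = torch.optim.Adam(list(enc.parameters()) + list(head.parameters()), lr=1e-3)
opt_mask = torch.optim.Adam(enc_mask.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss(reduction='sum')

x, x_masked = torch.randn(4, 32), torch.randn(4, 32)    # unmasked / masked views of a batch
y = torch.randint(0, 3, (4,))

feats = torch.cat([enc.forward_logits(x), enc_mask.forward_logits(x_masked)], dim=-1)
loss = loss_fn(head(feats), y)
opt.zero_grad()
opt_mask.zero_grad()
loss.backward()
opt.step()
opt_mask.step()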
class ClassificationModel: def __init__(self, bert_model=config.bert_model, gpu=False, seed=0): self.gpu = gpu self.bert_model = bert_model self.train_df = data_reader.load_train_dataset(config.data_path) self.val_df = data_reader.load_dev_dataset(config.data_path) self.test_df = data_reader.load_test_dataset(config.data_path) self.num_classes = len(LABELS) self.model = None self.optimizer = None self.tokenizer = BertTokenizer.from_pretrained(self.bert_model) # to plot loss during training process self.plt_x = [] self.plt_y = [] random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if self.gpu: torch.cuda.manual_seed_all(seed) def __init_model(self): if self.gpu: self.device = torch.device("cuda") else: self.device = torch.device("cpu") self.model.to(self.device) print(torch.cuda.memory_allocated(self.device)) # log available cuda if self.device.type == 'cuda': print(torch.cuda.get_device_name(0)) print('Memory Usage:') print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB') print('Cached: ', round(torch.cuda.memory_cached(0) / 1024**3, 1), 'GB') def new_model(self): self.model = BertForSequenceClassification.from_pretrained( self.bert_model, num_labels=self.num_classes) self.__init_model() def load_model(self, path_model, path_config): self.model = BertForSequenceClassification(BertConfig(path_config), num_labels=self.num_classes) self.model.load_state_dict(torch.load(path_model)) self.__init_model() def save_model(self, path_model, path_config, epoch_n, acc, f1): if not os.path.exists(path_model): os.makedirs(path_model) model_save_path = os.path.join( path_model, 'model_{:.4f}_{:.4f}_{:.4f}'.format(epoch_n, acc, f1)) torch.save(self.model.state_dict(), model_save_path) if not os.path.exists(path_config): os.makedirs(path_config) model_config_path = os.path.join(path_config, 'config.cf') with open(model_config_path, 'w') as f: f.write(self.model.config.to_json_string()) def train(self, epochs, batch_size=config.batch_size, lr=config.lr, plot_path=None, model_path=None, config_path=None): model_params = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model_params if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in model_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] self.optimizer = BertAdam( optimizer_grouped_parameters, lr=lr, warmup=0.1, t_total=int(len(self.train_df) / batch_size) * epochs) nb_tr_steps = 0 train_features = data_reader.convert_examples_to_features( self.train_df, config.MAX_SEQ_LENGTH, self.tokenizer) # create tensor of all features all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # eval dataloader eval_features = data_reader.convert_examples_to_features( self.val_df, config.MAX_SEQ_LENGTH, self.tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = 
torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size) # class weighting _, counts = np.unique(self.train_df['label'], return_counts=True) class_weights = [sum(counts) / c for c in counts] # assign wight to each input sample example_weights = [class_weights[e] for e in self.train_df['label']] sampler = WeightedRandomSampler(example_weights, len(self.train_df['label'])) train_dataloader = DataLoader(train_data, sampler=sampler, batch_size=batch_size) self.model.train() for e in range(epochs): print("Epoch {}".format(e)) if e is not 0: f1, acc = self.val(eval_dataloader) print("\nF1 score: {}, Accuracy: {}".format(f1, acc)) if model_path is not None and config_path is not None: if e is not 0: self.save_model(model_path, config_path, e, acc, f1) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = self.model(input_ids, segment_ids, input_mask, label_ids) loss.backward() #if plot_path is not None: # self.plt_y.append(loss.item()) # self.plt_x.append(nb_tr_steps) # self.save_plot(plot_path) nb_tr_steps += 1 self.optimizer.step() self.optimizer.zero_grad() if self.gpu: torch.cuda.empty_cache() def val(self, eval_dataloader, batch_size=config.batch_size): f1, acc = 0, 0 nb_eval_examples = 0 for input_ids, input_mask, segment_ids, gnd_labels in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): logits = self.model(input_ids, segment_ids, input_mask) predicted_labels = np.argmax(logits.detach().cpu().numpy(), axis=1) acc += np.sum(predicted_labels == gnd_labels.numpy()) tmp_eval_f1 = f1_score(predicted_labels, gnd_labels, average='macro') f1 += tmp_eval_f1 * input_ids.size(0) nb_eval_examples += input_ids.size(0) return f1 / nb_eval_examples, acc / nb_eval_examples def save_plot(self, path): fig, ax = plt.subplots() ax.plot(self.plt_x, self.plt_y) ax.set(xlabel='Training steps', ylabel='Loss') fig.savefig(path) plt.close() def create_test_predictions(self, path): tests_features = data_reader.convert_examples_to_features( self.x_test, [-1] * len(self.test_df), config.MAX_SEQ_LENGTH, self.tokenizer) all_input_ids = torch.tensor([f.input_ids for f in tests_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in tests_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in tests_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in tests_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16) predictions = [] inverse_labels = {v: k for k, v in LABELS} for input_ids, input_mask, segment_ids, gnd_labels in tqdm( test_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): encoded_layers, logits = self.model(input_ids, segment_ids, input_mask) predictions += [ inverse_labels[p] for p in list(np.argmax(logits.detach().cpu().numpy(), axis=1)) ] with 
open(path, "w") as csv_file: writer = csv.writer(csv_file, delimiter=',') for i, prediction in enumerate(predictions): writer.writerow([int(self.x_test_ids[i]), prediction]) return predictions
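# Minimal sketch of the class-balancing trick used in train() above: weight every example by
# the inverse frequency of its label and let WeightedRandomSampler draw roughly balanced
# batches with replacement. The labels and features here are toy data, not the project's
# dataset.
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

labels = np.array([0] * 90 + [1] * 10)                   # imbalanced toy labels
features = torch.randn(len(labels), 8)

_, counts = np.unique(labels, return_counts=True)
class_weights = [sum(counts) / c for c in counts]        # rarer class -> larger weight
example_weights = [class_weights[y] for y in labels]

sampler = WeightedRandomSampler(example_weights, num_samples=len(labels))
loader = DataLoader(TensorDataset(features, torch.tensor(labels)),
                    sampler=sampler, batch_size=16)

for xb, yb in loader:
    pass  # in expectation, each batch now contains both classes in similar proportion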
class MTDNNModel(object): def __init__(self, opt, state_dict=None, num_train_step=-1): self.config = opt self.updates = state_dict[ 'updates'] if state_dict and 'updates' in state_dict else 0 self.train_loss = AverageMeter() self.network = SANBertNetwork(opt) if state_dict: new_state = set(self.network.state_dict().keys()) for k in list(state_dict['state'].keys()): if k not in new_state: del state_dict['state'][k] for k, v in list(self.network.state_dict().items()): if k not in state_dict['state']: state_dict['state'][k] = v self.network.load_state_dict(state_dict['state']) self.mnetwork = nn.DataParallel( self.network) if opt['multi_gpu_on'] else self.network self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) no_decay = [ 'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight' ] optimizer_parameters = [{ 'params': [ p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] # note that adamax are modified based on the BERT code if opt['optimizer'] == 'sgd': self.optimizer = optim.sgd(optimizer_parameters, opt['learning_rate'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adamax': self.optimizer = Adamax(optimizer_parameters, opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False elif opt['optimizer'] == 'adadelta': self.optimizer = optim.Adadelta(optimizer_parameters, opt['learning_rate'], rho=0.95) elif opt['optimizer'] == 'adam': self.optimizer = Adam(optimizer_parameters, lr=opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) if state_dict and 'optimizer' in state_dict: self.optimizer.load_state_dict(state_dict['optimizer']) if opt.get('have_lr_scheduler', False): if opt.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=opt['lr_gamma'], patience=3) elif opt.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentialLR(self.optimizer, gamma=opt.get('lr_gamma', 0.95)) else: milestones = [ int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',') ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=opt.get('lr_gamma')) else: self.scheduler = None self.ema = None if opt['ema_opt'] > 0: self.ema = EMA(self.config['ema_gamma'], self.network) self.para_swapped = False def setup_ema(self): if self.config['ema_opt']: self.ema.setup() def update_ema(self): if self.config['ema_opt']: self.ema.update() def eval(self): if self.config['ema_opt']: self.ema.swap_parameters() self.para_swapped = True def train(self): if self.para_swapped: self.ema.swap_parameters() self.para_swapped = False def update(self, batch_meta, batch_data): self.network.train() labels = batch_data[batch_meta['label']] soft_labels = None temperature = 1.0 if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta): soft_labels = batch_meta['soft_label'] if batch_meta['pairwise']: labels = labels.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0] if self.config['cuda']: y = 
Variable(labels.cuda(async=True), requires_grad=False) else: y = Variable(labels, requires_grad=False) task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) logits = self.mnetwork(*inputs) if batch_meta['pairwise']: logits = logits.view(-1, batch_meta['pairwise_size']) if self.config.get('weighted_on', False): if self.config['cuda']: weight = Variable( batch_data[batch_meta['factor']].cuda(async=True)) else: weight = Variable(batch_data[batch_meta['factor']]) if task_type > 0: loss = torch.mean( F.mse_loss(logits.squeeze(), y, reduce=False) * weight) else: loss = torch.mean( F.cross_entropy(logits, y, reduce=False) * weight) if soft_labels is not None: # compute KL label_size = soft_labels.size(1) kd_loss = F.kl_div( F.log_softmax(logits.view(-1, label_size).float(), 1), soft_labels) * label_size loss = loss + kd_loss else: if task_type > 0: loss = F.mse_loss(logits.squeeze(), y) else: loss = F.cross_entropy(logits, y) if soft_labels is not None: # compute KL label_size = soft_labels.size(1) # note that kl_div return element-wised mean, thus it requires to time with the label size # In the pytorch v1.x, it simply uses the flag: reduction='batchmean' # TODO: updated the package to support the latest PyTorch (xiaodl) kd_loss = F.kl_div( F.log_softmax(logits.view(-1, label_size).float(), 1), soft_labels) * label_size loss = loss + kd_loss self.train_loss.update(loss.item(), logits.size(0)) self.optimizer.zero_grad() loss.backward() if self.config['global_grad_clipping'] > 0: torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.config['global_grad_clipping']) self.optimizer.step() self.updates += 1 self.update_ema() def predict(self, batch_meta, batch_data): self.network.eval() task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) score = self.mnetwork(*inputs) if batch_meta['pairwise']: score = score.contiguous().view(-1, batch_meta['pairwise_size']) if task_type < 1: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['true_label'] else: if task_type < 1: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['label'] def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) ema_state = dict([ (k, v.cpu()) for k, v in self.ema.model.state_dict().items() ]) if self.ema is not None else dict() params = { 'state': network_state, 'optimizer': self.optimizer.state_dict(), 'ema': ema_state, 'config': self.config, } torch.save(params, filename) logger.info('model saved to {}'.format(filename)) def cuda(self): self.network.cuda() if self.config['ema_opt']: self.ema.cuda()
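# Sketch of the knowledge-distillation term computed in update() above: KL divergence between
# the student's log-softmax and the teacher's soft labels, added to the hard-label loss. The
# in-code comment notes that F.kl_div returns an element-wise mean and is therefore scaled by
# the label size; with PyTorch >= 1.0 the same effect is obtained with reduction='batchmean',
# which is what this sketch uses (temperature fixed to 1, as in the snippet).
import torch
import torch.nn.functional as F

logits = torch.randn(4, 3, requires_grad=True)           # student outputs
y = torch.tensor([0, 2, 1, 0])                            # hard labels
soft_labels = F.softmax(torch.randn(4, 3), dim=1)         # teacher distribution

ce_loss = F.cross_entropy(logits, y)
kd_loss = F.kl_div(F.log_softmax(logits, dim=1), soft_labels, reduction='batchmean')
loss = ce_loss + kd_loss
loss.backward()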
class RelationModel(object): """ A wrapper class for the training and evaluation of models. """ def __init__(self, opt, batch_num): self.opt = opt self.model = Extraction(opt) self.model.cuda() param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] num_train_optimization_steps = batch_num * (opt['num_epoch'] + 1) self.optimizer = BertAdam(optimizer_grouped_parameters, lr=opt['lr'], warmup=0.1, t_total=num_train_optimization_steps) self.bce = nn.BCELoss(reduction='none') self.ema = layers.EMA(self.model, opt['ema']) self.ema.register() def update(self, batch, i): """ Run a step of forward and backward model update. """ if self.opt['cuda']: inputs = [Variable(torch.LongTensor(b).cuda()) for b in batch[:4]] o_labels = Variable(torch.FloatTensor(batch[4]).cuda()) mask = Variable(torch.FloatTensor(batch[5]).cuda()) # step forward self.model.train() self.optimizer.zero_grad() loss_mask = mask.unsqueeze(-1) o_probs = self.model(inputs, mask) o_probs = o_probs.pow(2) o_loss = self.bce(o_probs, o_labels) # .view(batch_size, seq_len, 2) o_loss = 0.5 * torch.sum(o_loss.mul(loss_mask)) / torch.sum(loss_mask) loss = o_loss # backward loss.backward() self.optimizer.step() self.ema.update() loss_val = loss.data.item() return loss_val def predict_obj_per_instance(self, inputs, mask, user_cuda=None): """ Run forward prediction. If unsort is True, recover the original order of the batch. """ if self.opt['cuda']: if user_cuda == None: inputs = [Variable(torch.LongTensor(b).cuda()) for b in inputs] mask = Variable(torch.FloatTensor(mask).cuda()) else: inputs = [ Variable(torch.LongTensor(b).cuda(user_cuda)) for b in inputs ] mask = Variable(torch.FloatTensor(mask).cuda(user_cuda)) self.model.eval() words, distance_to_s, s_start, s_end = inputs hidden, sentence_rep = self.model.based_encoder(words) o_probs = self.model.o_sublayer(hidden, sentence_rep, distance_to_s, s_start, s_end, mask) o_probs = o_probs.pow(2) o_probs = o_probs.mul(mask.unsqueeze(-1)).data.cpu().numpy() return o_probs def update_lr(self, new_lr): torch_utils.change_lr(self.optimizer, new_lr) def save(self, filename, epoch): params = { 'model': self.model.state_dict(), 'config': self.opt, 'epoch': epoch } try: torch.save(params, filename) print("model saved to {}".format(filename)) except BaseException: print("[Warning: Saving failed... continuing anyway.]") def load(self, filename): try: checkpoint = torch.load(filename) except BaseException: print("Cannot load model from {}".format(filename)) exit() self.model.load_state_dict(checkpoint['model']) self.opt = checkpoint['config']
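# RelationModel keeps an exponential moving average of the weights via layers.EMA
# (register() once, update() after every optimizer step). That class isn't shown here, so the
# following is a hypothetical minimal version of the usual register/update pattern, not the
# project's actual implementation.
import torch
import torch.nn as nn

class SimpleEMA:
    def __init__(self, model, decay=0.999):
        self.model, self.decay = model, decay
        self.shadow = {}

    def register(self):
        for name, p in self.model.named_parameters():
            if p.requires_grad:
                self.shadow[name] = p.detach().clone()

    def update(self):
        # shadow = decay * shadow + (1 - decay) * current parameter
        with torch.no_grad():
            for name, p in self.model.named_parameters():
                if p.requires_grad:
                    self.shadow[name].mul_(self.decay).add_(p.detach(), alpha=1 - self.decay)

model = nn.Linear(4, 2)
ema = SimpleEMA(model, decay=0.99)
ema.register()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
for _ in range(5):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.update()                                          # shadow weights trail the live weights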
class MTDNNModel(object): def __init__(self, opt, state_dict=None, num_train_step=-1): self.config = opt self.updates = state_dict[ 'updates'] if state_dict and 'updates' in state_dict else 0 self.local_updates = 0 self.train_loss = AverageMeter() self.network = SANBertNetwork(opt) if state_dict: self.network.load_state_dict(state_dict['state'], strict=False) self.mnetwork = nn.DataParallel( self.network) if opt['multi_gpu_on'] else self.network self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) if opt['cuda']: self.network.cuda() no_decay = [ 'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight' ] optimizer_parameters = [{ 'params': [ p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] # note that adamax are modified based on the BERT code if opt['optimizer'] == 'sgd': self.optimizer = optim.SGD(optimizer_parameters, opt['learning_rate'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adamax': self.optimizer = Adamax(optimizer_parameters, opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule'], weight_decay=opt['weight_decay']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False elif opt['optimizer'] == 'radam': self.optimizer = RAdam(optimizer_parameters, opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule'], eps=opt['adam_eps'], weight_decay=opt['weight_decay']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False # The current radam does not support FP16. opt['fp16'] = False elif opt['optimizer'] == 'adadelta': self.optimizer = optim.Adadelta(optimizer_parameters, opt['learning_rate'], rho=0.95) elif opt['optimizer'] == 'adam': self.optimizer = Adam(optimizer_parameters, lr=opt['learning_rate'], warmup=opt['warmup'], t_total=num_train_step, max_grad_norm=opt['grad_clipping'], schedule=opt['warmup_schedule'], weight_decay=opt['weight_decay']) if opt.get('have_lr_scheduler', False): opt['have_lr_scheduler'] = False else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) if state_dict and 'optimizer' in state_dict: self.optimizer.load_state_dict(state_dict['optimizer']) if opt['fp16']: try: from apex import amp global amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(self.network, self.optimizer, opt_level=opt['fp16_opt_level']) self.network = model self.optimizer = optimizer if opt.get('have_lr_scheduler', False): if opt.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=opt['lr_gamma'], patience=3) elif opt.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentialLR(self.optimizer, gamma=opt.get('lr_gamma', 0.95)) else: milestones = [ int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',') ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=opt.get('lr_gamma')) else: self.scheduler = None self.ema = None if opt['ema_opt'] > 0: self.ema = EMA(self.config['ema_gamma'], self.network) if opt['cuda']: self.ema.cuda() self.para_swapped = False # zero optimizer grad self.optimizer.zero_grad() def setup_ema(self): if self.config['ema_opt']: self.ema.setup() def update_ema(self): if self.config['ema_opt']: self.ema.update() def eval(self): if self.config['ema_opt']: self.ema.swap_parameters() self.para_swapped = True def train(self): if self.para_swapped: self.ema.swap_parameters() self.para_swapped = False def update(self, batch_meta, batch_data): self.network.train() labels = batch_data[batch_meta['label']] soft_labels = None if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta): soft_labels = batch_meta['soft_label'] task_type = batch_meta['task_type'] if task_type == TaskType.Span: start = batch_data[batch_meta['start']] end = batch_data[batch_meta['end']] if self.config["cuda"]: start = start.cuda(non_blocking=True) end = end.cuda(non_blocking=True) start.requires_grad = False end.requires_grad = False else: y = labels if task_type == TaskType.Ranking: y = y.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0] if self.config['cuda']: y = y.cuda(non_blocking=True) y.requires_grad = False task_id = batch_meta['task_id'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) if self.config.get('weighted_on', False): if self.config['cuda']: weight = batch_data[batch_meta['factor']].cuda( non_blocking=True) else: weight = batch_data[batch_meta['factor']] if task_type == TaskType.Span: start_logits, end_logits = self.mnetwork(*inputs) ignored_index = start_logits.size(1) start.clamp_(0, ignored_index) end.clamp_(0, ignored_index) if self.config.get('weighted_on', False): loss = torch.mean(F.cross_entropy(start_logits, start, reduce=False) * weight) + \ torch.mean(F.cross_entropy(end_logits, end, reduce=False) * weight) else: loss = F.cross_entropy(start_logits, start, ignore_index=ignored_index) + \ F.cross_entropy(end_logits, end, ignore_index=ignored_index) loss = loss / 2 elif task_type == TaskType.SequenceLabeling: y = y.view(-1) logits = self.mnetwork(*inputs) loss = F.cross_entropy(logits, y, ignore_index=-1) else: logits = self.mnetwork(*inputs) if task_type == TaskType.Ranking: logits = logits.view(-1, batch_meta['pairwise_size']) if self.config.get('weighted_on', False): if task_type == TaskType.Regression: loss = torch.mean( F.mse_loss(logits.squeeze(), y, reduce=False) * weight) else: loss = torch.mean( F.cross_entropy(logits, y, reduce=False) * weight) if soft_labels is not None: # compute KL label_size = soft_labels.size(1) kd_loss = F.kl_div(F.log_softmax( logits.view(-1, label_size).float(), 1), soft_labels, reduction='batchmean') loss = loss + kd_loss else: if task_type == TaskType.Regression: loss = 
F.mse_loss(logits.squeeze(), y) else: loss = F.cross_entropy(logits, y) if soft_labels is not None: # compute KL label_size = soft_labels.size(1) kd_loss = F.kl_div(F.log_softmax( logits.view(-1, label_size).float(), 1), soft_labels, reduction='batchmean') loss = loss + kd_loss self.train_loss.update(loss.item(), logits.size(0)) # scale loss loss = loss / self.config.get('grad_accumulation_step', 1) if self.config['fp16']: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.local_updates += 1 if self.local_updates % self.config.get('grad_accumulation_step', 1) == 0: if self.config['global_grad_clipping'] > 0: if self.config['fp16']: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.config['global_grad_clipping']) else: torch.nn.utils.clip_grad_norm_( self.network.parameters(), self.config['global_grad_clipping']) self.updates += 1 # reset number of the grad accumulation self.optimizer.step() self.optimizer.zero_grad() self.update_ema() def predict(self, batch_meta, batch_data): self.network.eval() task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) score = self.mnetwork(*inputs) if task_type == TaskType.Ranking: score = score.contiguous().view(-1, batch_meta['pairwise_size']) assert task_type == TaskType.Ranking score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['true_label'] elif task_type == TaskType.SequenceLabeling: mask = batch_data[batch_meta['mask']] score = score.contiguous() score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).reshape(mask.size()).tolist() valied_lenght = mask.sum(1).tolist() final_predict = [] for idx, p in enumerate(predict): final_predict.append(p[:valied_lenght[idx]]) score = score.reshape(-1).tolist() return score, final_predict, batch_meta['label'] else: if task_type == TaskType.Classification: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['label'] def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) ema_state = dict([ (k, v.cpu()) for k, v in self.ema.model.state_dict().items() ]) if self.ema is not None else dict() params = { 'state': network_state, 'optimizer': self.optimizer.state_dict(), 'ema': ema_state, 'config': self.config, } torch.save(params, filename) logger.info('model saved to {}'.format(filename)) def load(self, checkpoint): model_state_dict = torch.load(checkpoint) if model_state_dict['config']['init_checkpoint'].rsplit('/', 1)[1] != \ self.config['init_checkpoint'].rsplit('/', 1)[1]: logger.error( '*** SANBert network is pretrained on a different Bert Model. Please use that to fine-tune for other tasks. 
***' ) sys.exit() self.network.load_state_dict(model_state_dict['state'], strict=False) self.optimizer.load_state_dict(model_state_dict['optimizer']) self.config = model_state_dict['config'] if self.ema: self.ema.model.load_state_dict(model_state_dict['ema']) def cuda(self): self.network.cuda() if self.config['ema_opt']: self.ema.cuda()
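# Sketch of the gradient-accumulation pattern used by update() in the class above: the loss is
# divided by grad_accumulation_step, gradients accumulate over several mini-batches, and the
# optimizer only clips and steps every grad_accumulation_step batches. The fp16/amp branch is
# omitted; model and data are toy stand-ins.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
grad_accumulation_step, global_grad_clipping = 4, 1.0
local_updates = 0

optimizer.zero_grad()
for step in range(16):
    x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
    loss = F.cross_entropy(model(x), y)
    (loss / grad_accumulation_step).backward()           # scale so the summed gradient matches one large batch
    local_updates += 1
    if local_updates % grad_accumulation_step == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), global_grad_clipping)
        optimizer.step()
        optimizer.zero_grad()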
def main(): torch.cuda.empty_cache() device_name = tf.test.gpu_device_name() if device_name != '/device:GPU:0': raise SystemError('GPU device not found') print('Found GPU at: {}'.format(device_name)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() torch.cuda.get_device_name(0) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) #model.cuda() scores_train = [] first_sent_train = [] second_sent_train = [] scores_test = [] first_sent_test = [] second_sent_test = [] sent_pairs = [] with open(train_data_path, encoding='utf-8') as fin: train_data = fin.read().split('\n') train_data = [line for line in train_data if line.strip()] for line in train_data: pair = [] line1 = line.split('\t') if float(line1[4]) <= 2.5: scores_train.append(0) else: scores_train.append(1) first_sent_train.append(line1[5]) second_sent_train.append(line1[6]) pair.append(str(line1[5])) pair.append(str(line1[6])) sent_pairs.append(pair) with open(test_data_path, encoding='utf-8') as fin: test_data = fin.read().split('\n') test_data = [line for line in test_data if line.strip()] for line in test_data: line1 = line.split('\t') if float(line1[4]) <= 2.5: scores_test.append(0) else: scores_test.append(1) first_sent_test.append(line1[5]) second_sent_test.append(line1[6]) pairs_train = [] pairs_test = [] segment_ids_train = [] segment_ids_test = [] tokenized_pairs_train = [] tokenized_pairs_test = [] for sent1, sent2 in zip(first_sent_train, second_sent_train): token1 = tokenizer.tokenize(sent1) token2 = tokenizer.tokenize(sent2) pair_tokens = [] pair_segment_ids = [] pair_tokens.append("[CLS] ") pair_segment_ids.append(0) for t in token1: pair_tokens.append(t) pair_segment_ids.append(0) pair_tokens.append('[SEP]') for t in token2: pair_tokens.append(t) pair_segment_ids.append(1) pair_tokens.append('[SEP]') pair_segment_ids.append(1) tokenized_pairs_train.append(pair_tokens) segment_ids_train.append(pair_segment_ids) for sent1, sent2 in zip(first_sent_test, second_sent_test): token1 = tokenizer.tokenize(sent1) token2 = tokenizer.tokenize(sent2) pair_tokens = [] pair_segment_ids = [] pair_tokens.append("[CLS] ") pair_segment_ids.append(0) for t in token1: pair_tokens.append(t) pair_segment_ids.append(0) pair_tokens.append('[SEP]') for t in token2: pair_tokens.append(t) pair_segment_ids.append(1) pair_tokens.append('[SEP]') pair_segment_ids.append(1) tokenized_pairs_test.append(pair_tokens) segment_ids_test.append(pair_segment_ids) print("the first tokenized pair:") print(tokenized_pairs_train[0]) print("the first segment ids:") print(segment_ids_train[0]) input_ids_train = [ tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_train ] input_ids_train = pad_sequences(input_ids_train, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") input_ids_test = [ tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_test ] input_ids_test = pad_sequences(input_ids_test, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") segment_ids_train = pad_sequences(segment_ids_train, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") segment_ids_test = pad_sequences(segment_ids_test, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") #encoded = [tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs] #input_ids2 = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs]).unsqueeze(0) attention_masks_train = [] attention_masks_test = [] # Create a mask of 1s for each 
token followed by 0s for padding for seq in input_ids_train: seq_mask = [float(i > 0) for i in seq] attention_masks_train.append(seq_mask) for seq in input_ids_test: seq_mask = [float(i > 0) for i in seq] attention_masks_test.append(seq_mask) # Convert all of our data into torch tensors, the required datatype for our model train_inputs = torch.tensor(input_ids_train).to(torch.int64) validation_inputs = torch.tensor(input_ids_test).to(torch.int64) train_labels = torch.tensor(scores_train).float() validation_labels = torch.tensor(scores_test).float() train_masks = torch.tensor(attention_masks_train).to(torch.int64) validation_masks = torch.tensor(attention_masks_test).to(torch.int64) segment_ids_train = torch.tensor(segment_ids_train).to(torch.int64) segment_ids_test = torch.tensor(segment_ids_test).to(torch.int64) # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, # with an iterator the entire dataset does not need to be loaded into memory train_data = TensorDataset(train_inputs, train_masks, train_labels, segment_ids_train) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, segment_ids_test) validation_sampler = SequentialSampler(validation_data) validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) #BertPreTrainedModel = BertModel.from_pretrained('bert-base-uncased') model = BertSimilarity.from_pretrained('bert-base-uncased') model = model.cuda() # Set our model to training mode (as opposed to evaluation mode) model.train() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=.1) # Store our loss and accuracy for plotting train_loss_set = [] accuracy = {} # trange is a tqdm wrapper around the normal python range for _ in trange(epochs, desc="Epoch"): # Training # Tracking variables tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # Train the data for one epoch for step, batch in enumerate(train_dataloader): # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels, b_segment_ids = batch # Clear out the gradients (by default they accumulate) optimizer.zero_grad() # Forward pass probs = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=b_segment_ids) loss_func = torch.nn.BCELoss() batch_loss = loss_func(probs, b_labels) train_loss_set.append(batch_loss) # Backward pass batch_loss.backward() # Update parameters and take a step using the computed gradient optimizer.step() # Update tracking variables tr_loss += batch_loss nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 print("Train loss: {}".format(tr_loss / nb_tr_steps)) accuracy['train_loss'] = tr_loss / nb_tr_steps # Validation # Put model in evaluation mode to evaluate loss on the validation set model.eval() # Tracking variables eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 # Evaluate data for one epoch for batch in validation_dataloader: # Add batch to GPU batch = 
tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels, b_segment_ids = batch # Telling the model not to compute or store gradients, saving memory and speeding up validation with torch.no_grad(): # Forward pass, calculate logit predictions sigmoid = model(b_input_ids, attention_mask=b_input_mask, token_type_ids=b_segment_ids) # Move logits and labels to CPU sigmoid = sigmoid.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() tmp_eval_accuracy = flat_accuracy(sigmoid, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 accuracy['valid_accuracy'] = eval_accuracy / nb_eval_steps print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps)) print("Saving to output folder") acc_filename = os.path.join(model_save_path, 'accuracy.json') with open(acc_filename, 'w') as f: json.dump({k: float(v) for k, v in accuracy.items()}, f) train_loss_filename = os.path.join(model_save_path, 'trainloss.txt') with open(train_loss_filename, 'w') as f: f.writelines('{}\n'.format(float(l)) for l in train_loss_set) model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(model_save_path) tokenizer.save_pretrained(model_save_path) #storage_client = storage.Client() bucket_name = 'gs://gridspace-tts-data' #bucket = storage_client.get_bucket(bucket_name) cp_to_bucket_cmd = 'gsutil cp -r {} {}'.format(model_save_path, bucket_name) subprocess.check_call(cp_to_bucket_cmd, shell=True)
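# Helper sketch of the sentence-pair packing built inline above: [CLS] sent1 [SEP] sent2 [SEP],
# segment id 0 for the first sentence and 1 for the second, then an attention mask of 1s over
# real tokens and 0s over padding. It assumes the same BertTokenizer API as the surrounding
# code (pytorch_pretrained_bert); MAX_LEN is a small toy value. Note that the loop above
# appears to append "[CLS] " with a trailing space and to skip the segment id for the first
# [SEP]; this helper keeps tokens and segment ids aligned.
import torch
from pytorch_pretrained_bert import BertTokenizer        # assumption: package used by the code above

MAX_LEN = 16
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def encode_pair(sent1, sent2):
    tokens = ['[CLS]'] + tokenizer.tokenize(sent1) + ['[SEP]']
    segment_ids = [0] * len(tokens)
    second = tokenizer.tokenize(sent2) + ['[SEP]']
    tokens += second
    segment_ids += [1] * len(second)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)[:MAX_LEN]
    segment_ids = segment_ids[:MAX_LEN]
    pad = MAX_LEN - len(input_ids)
    attention_mask = [1] * len(input_ids) + [0] * pad
    input_ids += [0] * pad
    segment_ids += [0] * pad
    return (torch.tensor(input_ids), torch.tensor(segment_ids), torch.tensor(attention_mask))

ids, segs, mask = encode_pair('A man is playing a guitar.', 'Someone plays music.')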
class MTDNNModel(object): def __init__(self, opt, state_dict=None, num_train_step=-1): self.config = opt self.updates = state_dict[ 'updates'] if state_dict and 'updates' in state_dict else 0 self.local_updates = 0 self.train_loss = AverageMeter() self.network = SANBertNetwork(opt) if state_dict: self.network.load_state_dict(state_dict['state'], strict=False) self.mnetwork = nn.DataParallel( self.network) if opt['multi_gpu_on'] else self.network self.total_param = sum([ p.nelement() for p in self.network.parameters() if p.requires_grad ]) if opt['cuda']: self.network.cuda() optimizer_parameters = self._get_param_groups() self._setup_optim(optimizer_parameters, state_dict, num_train_step) self.para_swapped = False self.optimizer.zero_grad() self._setup_lossmap(self.config) def _get_param_groups(self): no_decay = [ 'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight' ] optimizer_parameters = [{ 'params': [ p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] return optimizer_parameters def _setup_optim(self, optimizer_parameters, state_dict=None, num_train_step=-1): if self.config['optimizer'] == 'sgd': self.optimizer = optim.SGD( optimizer_parameters, self.config['learning_rate'], weight_decay=self.config['weight_decay']) elif self.config['optimizer'] == 'adamax': self.optimizer = Adamax(optimizer_parameters, self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False elif self.config['optimizer'] == 'radam': self.optimizer = RAdam(optimizer_parameters, self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], eps=self.config['adam_eps'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False # The current radam does not support FP16. self.config['fp16'] = False elif self.config['optimizer'] == 'adam': self.optimizer = Adam(optimizer_parameters, lr=self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) if state_dict and 'optimizer' in state_dict: self.optimizer.load_state_dict(state_dict['optimizer']) if self.config['fp16']: try: from apex import amp global amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize( self.network, self.optimizer, opt_level=self.config['fp16_opt_level']) self.network = model self.optimizer = optimizer if self.config.get('have_lr_scheduler', False): if self.config.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau( self.optimizer, mode='max', factor=self.config['lr_gamma'], patience=3) elif self.config.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentialLR(self.optimizer, gamma=self.config.get( 'lr_gamma', 0.95)) else: milestones = [ int(step) for step in self.config.get( 'multi_step_lr', '10,20,30').split(',') ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=self.config.get('lr_gamma')) else: self.scheduler = None def _setup_lossmap(self, config): loss_types = config['loss_types'] self.task_loss_criterion = [] for idx, cs in enumerate(loss_types): assert cs is not None lc = LOSS_REGISTRY[cs]( name='Loss func of task {}: {}'.format(idx, cs)) self.task_loss_criterion.append(lc) def _setup_kd_lossmap(self, config): loss_types = config['kd_loss_types'] self.kd_task_loss_criterion = [] if config.get('mkd_opt', 0) > 0: for idx, cs in enumerate(loss_types): assert cs is not None lc = LOSS_REGISTRY[cs]( name='Loss func of task {}: {}'.format(idx, cs)) self.kd_task_loss_criterion.append(lc) def train(self): if self.para_swapped: self.para_swapped = False def _to_cuda(self, tensor): if tensor is None: return tensor if isinstance(tensor, list) or isinstance(tensor, tuple): y = [e.cuda(non_blocking=True) for e in tensor] for e in y: e.requires_grad = False else: y = tensor.cuda(non_blocking=True) y.requires_grad = False return y def update(self, batch_meta, batch_data): self.network.train() y = batch_data[batch_meta['label']] soft_labels = None task_type = batch_meta['task_type'] y = self._to_cuda(y) if self.config['cuda'] else y task_id = batch_meta['task_id'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) weight = None if self.config.get('weighted_on', False): if self.config['cuda']: weight = batch_data[batch_meta['factor']].cuda( non_blocking=True) else: weight = batch_data[batch_meta['factor']] logits = self.mnetwork(*inputs) # compute loss loss = 0 if self.task_loss_criterion[task_id] and (y is not None): loss = self.task_loss_criterion[task_id](logits, y, weight, ignore_index=-1) # compute kd loss if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta): soft_labels = batch_meta['soft_label'] soft_labels = self._to_cuda( soft_labels) if self.config['cuda'] else soft_labels kd_lc = self.kd_task_loss_criterion[task_id] kd_loss = kd_lc(logits, soft_labels, weight, ignore_index=-1) if kd_lc else 0 loss = loss + kd_loss self.train_loss.update(loss.item(), batch_data[batch_meta['token_id']].size(0)) # scale loss loss = loss / self.config.get('grad_accumulation_step', 1) if self.config['fp16']: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.local_updates += 1 if self.local_updates % self.config.get('grad_accumulation_step', 1) == 0: if self.config['global_grad_clipping'] > 0: if self.config['fp16']: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.config['global_grad_clipping']) else: torch.nn.utils.clip_grad_norm_( self.network.parameters(), self.config['global_grad_clipping']) self.updates += 1 # reset number of the grad accumulation self.optimizer.step() self.optimizer.zero_grad() def predict(self, 
batch_meta, batch_data): self.network.eval() task_id = batch_meta['task_id'] task_type = batch_meta['task_type'] inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) score = self.mnetwork(*inputs) if task_type == TaskType.Ranking: score = score.contiguous().view(-1, batch_meta['pairwise_size']) assert task_type == TaskType.Ranking score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['true_label'] elif task_type == TaskType.SeqenceLabeling: mask = batch_data[batch_meta['mask']] score = score.contiguous() score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).reshape(mask.size()).tolist() valied_lenght = mask.sum(1).tolist() final_predict = [] for idx, p in enumerate(predict): final_predict.append(p[:valied_lenght[idx]]) score = score.reshape(-1).tolist() return score, final_predict, batch_meta['label'] else: if task_type == TaskType.Classification: score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['label'] def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) params = { 'state': network_state, 'optimizer': self.optimizer.state_dict(), 'config': self.config, } torch.save(params, filename) logger.info('model saved to {}'.format(filename)) def load(self, checkpoint): model_state_dict = torch.load(checkpoint) if model_state_dict['config']['init_checkpoint'].rsplit('/', 1)[1] != \ self.config['init_checkpoint'].rsplit('/', 1)[1]: logger.error( '*** SANBert network is pretrained on a different Bert Model. Please use that to fine-tune for other tasks. ***' ) sys.exit() self.network.load_state_dict(model_state_dict['state'], strict=False) self.optimizer.load_state_dict(model_state_dict['optimizer']) self.config = model_state_dict['config'] def cuda(self): self.network.cuda()
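# _setup_lossmap/_setup_kd_lossmap above look criteria up in a LOSS_REGISTRY keyed by per-task
# loss types. That registry isn't shown in this snippet, so the following is a hypothetical
# minimal version of the pattern: map a loss name to a callable, build one criterion per task,
# and dispatch on task_id at update time.
import torch
import torch.nn.functional as F

LOSS_REGISTRY = {
    'ce': lambda logits, y, **kw: F.cross_entropy(logits, y, ignore_index=kw.get('ignore_index', -100)),
    'mse': lambda logits, y, **kw: F.mse_loss(logits.squeeze(-1), y.float()),
}

loss_types = ['ce', 'mse', 'ce']                          # one loss name per task
task_loss_criterion = [LOSS_REGISTRY[cs] for cs in loss_types]

task_id = 0
logits = torch.randn(4, 3)
y = torch.tensor([0, 1, 2, 1])
loss = task_loss_criterion[task_id](logits, y, ignore_index=-1)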
class ClassificationModel: def __init__(self, task, val=0.1, bert_model=BERT_MODEL, gpu=False, seed=0): self.gpu = gpu self.task = task self.bert_model = bert_model self.x_train, self.y_train = load_train_dataset(self.task) self.x_val = np.random.choice(self.x_train, size=(int(val * len(self.x_train)), ), replace=False) self.y_val = np.random.choice(self.y_train, size=(int(val * len(self.x_train)), ), replace=False) self.x_test_ids, self.x_test = load_test_dataset(self.task) self.num_classes = len(TASK_LABELS[task]) self.model = None self.optimizer = None self.tokenizer = BertTokenizer.from_pretrained(self.bert_model) self.plt_x = [] self.plt_y = [] random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if self.gpu: torch.cuda.manual_seed_all(seed) def __init_model(self): if self.gpu: self.device = torch.device("cuda") else: self.device = torch.device("cpu") self.model.to(self.device) print(torch.cuda.memory_allocated(self.device)) def new_model(self): self.model = BertForSequenceClassification.from_pretrained( self.bert_model, num_labels=self.num_classes) self.__init_model() def load_model(self, path_model, path_config): self.model = BertForSequenceClassification(BertConfig(path_config), num_labels=self.num_classes) self.model.load_state_dict(torch.load(path_model)) self.__init_model() def save_model(self, path_model, path_config): torch.save(self.model.state_dict(), path_model) with open(path_config, 'w') as f: f.write(self.model.config.to_json_string()) # noinspection PyArgumentList def train(self, epochs, plot_path, batch_size=32, lr=5e-5, model_path=None, config_path=None): model_params = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model_params if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in model_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] self.optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.1, t_total=int(len(self.x_train) / batch_size) * epochs) nb_tr_steps = 0 train_features = convert_examples_to_features(self.x_train, self.y_train, MAX_SEQ_LENGTH, self.tokenizer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) _, counts = np.unique(self.y_train, return_counts=True) class_weights = [sum(counts) / c for c in counts] example_weights = [class_weights[e] for e in self.y_train] sampler = WeightedRandomSampler(example_weights, len(self.y_train)) train_dataloader = DataLoader(train_data, sampler=sampler, batch_size=batch_size) self.model.train() for e in range(epochs): print(f"Epoch {e}") f1, acc = self.val() print(f"\nF1 score: {f1}, Accuracy: {acc}") if model_path is not None and config_path is not None: self.save_model(model_path, config_path) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = self.model(input_ids, segment_ids, input_mask, label_ids) loss.backward() self.plt_y.append(loss.item()) self.plt_x.append(nb_tr_steps) self.save_plot(plot_path) nb_tr_steps += 1 
self.optimizer.step() self.optimizer.zero_grad() if self.gpu: torch.cuda.empty_cache() def val(self, batch_size=32, test=False): eval_features = convert_examples_to_features(self.x_val, self.y_val, MAX_SEQ_LENGTH, self.tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size) f1, acc = 0, 0 nb_eval_examples = 0 for input_ids, input_mask, segment_ids, gnd_labels in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): logits = self.model(input_ids, segment_ids, input_mask) predicted_labels = np.argmax(logits.detach().cpu().numpy(), axis=1) acc += np.sum(predicted_labels == gnd_labels.numpy()) tmp_eval_f1 = f1_score(predicted_labels, gnd_labels, average='macro') f1 += tmp_eval_f1 * input_ids.size(0) nb_eval_examples += input_ids.size(0) return f1 / nb_eval_examples, acc / nb_eval_examples def save_plot(self, path): import matplotlib.pyplot as plt fig, ax = plt.subplots() ax.plot(self.plt_x, self.plt_y) ax.set(xlabel='Training steps', ylabel='Loss') fig.savefig(path) plt.close() def create_test_predictions(self, path): eval_features = convert_examples_to_features(self.x_test, [-1] * len(self.x_test), MAX_SEQ_LENGTH, self.tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=16) predictions = [] inverse_labels = {v: k for k, v in TASK_LABELS[self.task].items()} for input_ids, input_mask, segment_ids, gnd_labels in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): logits = self.model(input_ids, segment_ids, input_mask) predictions += [ inverse_labels[p] for p in list(np.argmax(logits.detach().cpu().numpy(), axis=1)) ] with open(path, "w") as csv_file: writer = csv.writer(csv_file, delimiter=',') for i, prediction in enumerate(predictions): writer.writerow([int(self.x_test_ids[i]), prediction]) return predictions
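# The val() method above averages a per-batch macro F1 weighted by batch size, which only
# approximates the corpus-level score. A common alternative, sketched here with
# scikit-learn's f1_score (presumably the same function imported above) and toy predictions,
# is to accumulate all predictions and labels first and compute the metrics once.
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

all_preds, all_labels = [], []
for _ in range(3):                                        # stand-in for the evaluation loader
    logits = np.random.randn(8, 4)
    labels = np.random.randint(0, 4, size=8)
    all_preds.extend(np.argmax(logits, axis=1))
    all_labels.extend(labels)

macro_f1 = f1_score(all_labels, all_preds, average='macro')
acc = accuracy_score(all_labels, all_preds)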
class MTDNNModel(object): def __init__(self, opt, state_dict=None, num_train_step=-1): self.config = opt self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0 self.local_updates = 0 self.train_loss = AverageMeter() self.initial_from_local = True if state_dict else False self.network = SANBertNetwork(opt, initial_from_local=self.initial_from_local) if state_dict: missing_keys, unexpected_keys = self.network.load_state_dict(state_dict['state'], strict=False) self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad]) if opt['cuda']: self.network.cuda() optimizer_parameters = self._get_param_groups() #print(optimizer_parameters) self._setup_optim(optimizer_parameters, state_dict, num_train_step) self.para_swapped = False self.optimizer.zero_grad() self._setup_lossmap(self.config) self._setup_kd_lossmap(self.config) self._setup_adv_lossmap(self.config) self._setup_adv_training(self.config) def _setup_adv_training(self, config): self.adv_teacher = None if config.get('adv_train', False): self.adv_teacher = SmartPerturbation(config['adv_epsilon'], config['multi_gpu_on'], config['adv_step_size'], config['adv_noise_var'], config['adv_p_norm'], config['adv_k'], config['fp16'], config['encoder_type'], loss_map=self.adv_task_loss_criterion) def _get_param_groups(self): no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_parameters = [ {'params': [p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] return optimizer_parameters def _setup_optim(self, optimizer_parameters, state_dict=None, num_train_step=-1): ###여기서 Error #print(len(optimizer_parameters[0]['params'])) if self.config['optimizer'] == 'sgd': self.optimizer = optim.SGD(optimizer_parameters, self.config['learning_rate'], weight_decay=self.config['weight_decay']) elif self.config['optimizer'] == 'adamax': self.optimizer = Adamax(optimizer_parameters, self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False elif self.config['optimizer'] == 'radam': self.optimizer = RAdam(optimizer_parameters, self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], eps=self.config['adam_eps'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False # The current radam does not support FP16. 
self.config['fp16'] = False elif self.config['optimizer'] == 'adam': self.optimizer = Adam(optimizer_parameters, lr=self.config['learning_rate'], warmup=self.config['warmup'], t_total=num_train_step, max_grad_norm=self.config['grad_clipping'], schedule=self.config['warmup_schedule'], weight_decay=self.config['weight_decay']) if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) print("="*50) #print(state_dict['optimizer']) if state_dict and 'optimizer' in state_dict: #print("Optimizer's state_dict:") #state_dict['optimizer']['param_groups'][0]['params']=state_dict['optimizer']['param_groups'][0]['params'][:77] #print(len(state_dict['optimizer']['param_groups'][0]['params'])) #for var_name in state_dict['optimizer']: # print(var_name, "\t", state_dict['optimizer'][var_name]) #print(self.optimizer.state_dict()) ###### #state_dict['optimizer'][var_name] = self.optimizer.load_state_dict(state_dict['optimizer']) ###여기서 Error if self.config['fp16']: try: from apex import amp global amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(self.network, self.optimizer, opt_level=self.config['fp16_opt_level']) self.network = model self.optimizer = optimizer if self.config.get('have_lr_scheduler', False): if self.config.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=self.config['lr_gamma'], patience=3) elif self.config.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentialLR(self.optimizer, gamma=self.config.get('lr_gamma', 0.95)) else: milestones = [int(step) for step in self.config.get('multi_step_lr', '10,20,30').split(',')] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=self.config.get('lr_gamma')) else: self.scheduler = None def _setup_lossmap(self, config): task_def_list: List[TaskDef] = config['task_def_list'] self.task_loss_criterion = [] for idx, task_def in enumerate(task_def_list): cs = task_def.loss lc = LOSS_REGISTRY[cs](name='Loss func of task {}: {}'.format(idx, cs)) self.task_loss_criterion.append(lc) def _setup_kd_lossmap(self, config): task_def_list: List[TaskDef] = config['task_def_list'] self.kd_task_loss_criterion = [] if config.get('mkd_opt', 0) > 0: for idx, task_def in enumerate(task_def_list): cs = task_def.kd_loss assert cs is not None lc = LOSS_REGISTRY[cs](name='KD Loss func of task {}: {}'.format(idx, cs)) self.kd_task_loss_criterion.append(lc) def _setup_adv_lossmap(self, config): task_def_list: List[TaskDef] = config['task_def_list'] self.adv_task_loss_criterion = [] if config.get('adv_train', False): for idx, task_def in enumerate(task_def_list): cs = task_def.adv_loss assert cs is not None lc = LOSS_REGISTRY[cs](name='Adv Loss func of task {}: {}'.format(idx, cs)) self.adv_task_loss_criterion.append(lc) def train(self): if self.para_swapped: self.para_swapped = False def _to_cuda(self, tensor): if tensor is None: return tensor if isinstance(tensor, list) or isinstance(tensor, tuple): y = [e.cuda(non_blocking=True) for e in tensor] for e in y: e.requires_grad = False else: y = tensor.cuda(non_blocking=True) y.requires_grad = False return y def update(self, batch_meta, batch_data, weight_alpha): #### self.network.train() y = batch_data[batch_meta['label']] y = self._to_cuda(y) if self.config['cuda'] else y task_id = batch_meta['task_id'] inputs = 
batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) weight = None if self.config['itw_on']: #### if self.config['cuda']: weight = torch.FloatTensor([batch_meta['weight']]).cuda(non_blocking=True)*weight_alpha else: weight = batch_meta['weight']*weight_alpha """ if self.config.get('weighted_on', False): if self.config['cuda']: weight = batch_data[batch_meta['factor']].cuda(non_blocking=True) else: weight = batch_data[batch_meta['factor']] """ # fw to get logits logits = self.mnetwork(*inputs) # compute loss loss = 0 if self.task_loss_criterion[task_id] and (y is not None): loss_criterion = self.task_loss_criterion[task_id] if isinstance(loss_criterion, RankCeCriterion) and batch_meta['pairwise_size'] > 1: # reshape the logits for ranking. loss = self.task_loss_criterion[task_id](logits, y, weight, ignore_index=-1, pairwise_size=batch_meta['pairwise_size']) else: loss = self.task_loss_criterion[task_id](logits, y, weight, ignore_index=-1) # compute kd loss if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta): soft_labels = batch_meta['soft_label'] soft_labels = self._to_cuda(soft_labels) if self.config['cuda'] else soft_labels kd_lc = self.kd_task_loss_criterion[task_id] kd_loss = kd_lc(logits, soft_labels, weight, ignore_index=-1) if kd_lc else 0 loss = loss + kd_loss # adv training if self.config.get('adv_train', False) and self.adv_teacher: # task info task_type = batch_meta['task_def']['task_type'] adv_inputs = [self.mnetwork, logits] + inputs + [task_type, batch_meta.get('pairwise_size', 1)] adv_loss = self.adv_teacher.forward(*adv_inputs) loss = loss + self.config['adv_alpha'] * adv_loss self.train_loss.update(loss.item(), batch_data[batch_meta['token_id']].size(0)) # scale loss loss = loss / self.config.get('grad_accumulation_step', 1) if self.config['fp16']: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.local_updates += 1 if self.local_updates % self.config.get('grad_accumulation_step', 1) == 0: if self.config['global_grad_clipping'] > 0: if self.config['fp16']: torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.config['global_grad_clipping']) else: torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.config['global_grad_clipping']) self.updates += 1 # reset number of the grad accumulation self.optimizer.step() self.optimizer.zero_grad() def encode(self, batch_meta, batch_data): self.network.eval() inputs = batch_data[:3] sequence_output = self.network.encode(*inputs)[0] return sequence_output # TODO: similar as function extract, preserve since it is used by extractor.py # will remove after migrating to transformers package def extract(self, batch_meta, batch_data): self.network.eval() # 'token_id': 0; 'segment_id': 1; 'mask': 2 inputs = batch_data[:3] all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs) return all_encoder_layers, pooled_output def predict(self, batch_meta, batch_data): self.network.eval() task_id = batch_meta['task_id'] task_def = TaskDef.from_dict(batch_meta['task_def']) task_type = task_def.task_type task_obj = tasks.get_task_obj(task_def) inputs = batch_data[:batch_meta['input_len']] if len(inputs) == 3: inputs.append(None) inputs.append(None) inputs.append(task_id) score = self.mnetwork(*inputs) if task_obj is not None: score, predict = task_obj.test_predict(score) elif task_type == TaskType.Ranking: score = score.contiguous().view(-1, batch_meta['pairwise_size']) assert 
task_type == TaskType.Ranking score = F.softmax(score, dim=1) score = score.data.cpu() score = score.numpy() predict = np.zeros(score.shape, dtype=int) positive = np.argmax(score, axis=1) for idx, pos in enumerate(positive): predict[idx, pos] = 1 predict = predict.reshape(-1).tolist() score = score.reshape(-1).tolist() return score, predict, batch_meta['true_label'] elif task_type == TaskType.SeqenceLabeling: mask = batch_data[batch_meta['mask']] score = score.contiguous() score = score.data.cpu() score = score.numpy() predict = np.argmax(score, axis=1).reshape(mask.size()).tolist() valied_lenght = mask.sum(1).tolist() final_predict = [] for idx, p in enumerate(predict): final_predict.append(p[: valied_lenght[idx]]) score = score.reshape(-1).tolist() return score, final_predict, batch_meta['label'] elif task_type == TaskType.Span: start, end = score predictions = [] if self.config['encoder_type'] == EncoderModelType.BERT: import experiments.squad.squad_utils as mrc_utils scores, predictions = mrc_utils.extract_answer(batch_meta, batch_data, start, end, self.config.get('max_answer_len', 5), do_lower_case=self.config.get('do_lower_case', False)) return scores, predictions, batch_meta['answer'] else: raise ValueError("Unknown task_type: %s" % task_type) return score, predict, batch_meta['label'] def save(self, filename): network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()]) params = { 'state': network_state, 'optimizer': self.optimizer.state_dict(), 'config': self.config, } torch.save(params, filename) logger.info('model saved to {}'.format(filename)) def load(self, checkpoint): model_state_dict = torch.load(checkpoint) if 'state' in model_state_dict: self.network.load_state_dict(model_state_dict['state'], strict=False) if 'optimizer' in model_state_dict: self.optimizer.load_state_dict(model_state_dict['optimizer']) if 'config' in model_state_dict: self.config.update(model_state_dict['config']) def cuda(self): self.network.cuda()
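The `_get_param_groups` helper above implements a pattern that recurs in nearly every snippet that follows: bias and LayerNorm parameters are exempted from weight decay, everything else gets 0.01. Below is a minimal standalone sketch of that grouping idea, assuming only a generic `torch.nn.Module`; the helper name `build_param_groups` and the `AdamW` usage line are illustrative, not taken from the source.

import torch
from torch import nn

def build_param_groups(model: nn.Module, weight_decay: float = 0.01):
    # Bias and LayerNorm parameters are excluded from weight decay,
    # mirroring the no_decay lists used throughout these snippets.
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

# usage: optimizer = torch.optim.AdamW(build_param_groups(model), lr=5e-5)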
def model_go(): seed = 12 torch.manual_seed(seed) # bert_model_name = 'bert-large-uncased' bert_model_name = 'bert-base-uncased' lazy = True forward_size = 128 # batch_size = 64 batch_size = 128 gradient_accumulate_step = int(batch_size / forward_size) warmup_proportion = 0.1 learning_rate = 5e-5 num_train_epochs = 3 eval_frequency = 5000 do_lower_case = True ignore_non_verifiable = True doc_filter_value = 0.005 doc_top_k = 5 experiment_name = f'fever_v0_slevel_retri_(ignore_non_verifiable:{ignore_non_verifiable})' debug_mode = False max_l = 128 # est_datasize = 900_000 num_class = 1 # num_train_optimization_steps device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # workaround for initializing the vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace("false", namespace="labels") # 0 vocab.add_token_to_namespace("true", namespace="labels") # 1 vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Load Dataset train_upstream_doc_results = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/" "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl" ) dev_upstream_doc_results = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/" "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl" ) # train_list = common.load_json(config.TRAIN_FILE) dev_list = common.load_jsonl(config.FEVER_DEV) train_fitems = fever_s_level_sampler.get_sentence_forward_pair( 'train', train_upstream_doc_results, is_training=True, debug=debug_mode, ignore_non_verifiable=ignore_non_verifiable, top_k=doc_top_k, filter_value=doc_filter_value) dev_fitems = fever_s_level_sampler.get_sentence_forward_pair( 'dev', dev_upstream_doc_results, is_training=False, debug=debug_mode, ignore_non_verifiable=ignore_non_verifiable, top_k=doc_top_k, filter_value=doc_filter_value) # Just to show the information fever_p_level_sampler.down_sample_neg(train_fitems, None) fever_p_level_sampler.down_sample_neg(dev_fitems, None) if debug_mode: dev_list = dev_list[:100] eval_frequency = 2 # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio) est_datasize = len(train_fitems) dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id') bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case) bert_cs_reader = BertContentSelectionReader( bert_tokenizer, lazy, is_paired=True, example_filter=lambda x: len(x['context']) == 0, max_l=max_l, element_fieldname='element') bert_encoder = BertModel.from_pretrained(bert_model_name) model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':
0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \ num_train_epochs if debug_mode: num_train_optimization_steps = 100 print("Estimated training size", est_datasize) print("Number of optimization steps:", num_train_optimization_steps) optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_optimization_steps) dev_instances = bert_cs_reader.read(dev_fitems) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) forbackward_step = 0 update_step = 0 logging_agent = save_tool.ScoreLogger({}) if not debug_mode: # # # Create Log File file_path_prefix, date = save_tool.gen_file_prefix( f"{experiment_name}") # Save the source code. script_name = os.path.basename(__file__) with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() # # # Log File end for epoch_i in range(num_train_epochs): print("Epoch:", epoch_i) # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio) random.shuffle(train_fitems) train_instance = bert_cs_reader.read(train_fitems) train_iter = biterator(train_instance, num_epochs=1, shuffle=True) for batch in tqdm(train_iter): model.train() batch = move_to_device(batch, device_num) paired_sequence = batch['paired_sequence'] paired_segments_ids = batch['paired_segments_ids'] labels_ids = batch['label'] att_mask, _ = torch_util.get_length_and_mask(paired_sequence) s1_span = batch['bert_s1_span'] s2_span = batch['bert_s2_span'] loss = model( paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask, mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN, labels=labels_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if gradient_accumulate_step > 1: loss = loss / gradient_accumulate_step loss.backward() forbackward_step += 1 if forbackward_step % gradient_accumulate_step == 0: optimizer.step() optimizer.zero_grad() update_step += 1 if update_step % eval_frequency == 0: print("Update steps:", update_step) dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True, with_probs=True) copied_dev_o_dict = copy.deepcopy(dev_o_dict) copied_dev_d_list = copy.deepcopy(dev_list) list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_5 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5, result_field='predicted_evidence') list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_evidence') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_sent_only( copied_dev_d_list, dev_list, max_evidence=5) score_05 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_2 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2, result_field='predicted_evidence') list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_evidence') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_sent_only( copied_dev_d_list, dev_list, max_evidence=5) score_02 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_1 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.1, result_field='predicted_evidence') list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_1, 'id', 'predicted_evidence') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_sent_only( copied_dev_d_list, dev_list, max_evidence=5) score_01 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } logging_item = { 'score_01': score_01, 'score_02': score_02, 'score_05': score_05, } print(json.dumps(logging_item, indent=2)) s01_ss_score = score_01['ss'] s02_ss_score = score_02['ss'] s05_ss_score = score_05['ss'] if not debug_mode: save_file_name = f'i({update_step})|e({epoch_i})' \ f'|v01_ofever({s01_ss_score})' \ f'|v02_ofever({s02_ss_score})' \ f'|v05_ofever({s05_ss_score})|seed({seed})' common.save_jsonl( cur_eval_results_list, Path(file_path_prefix) / f"{save_file_name}_dev_s_level_results.jsonl") # print(save_file_name) logging_agent.incorporate_results({}, save_file_name, logging_item) logging_agent.logging_to_file( Path(file_path_prefix) / "log.json") model_to_save = model.module if hasattr( model, 'module') else model output_model_file = Path( file_path_prefix) / save_file_name torch.save(model_to_save.state_dict(), str(output_model_file))
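`model_go` runs forward passes on a micro-batch (`forward_size`) smaller than the effective batch (`batch_size`) and only steps the optimizer every `gradient_accumulate_step` iterations. A minimal sketch of that accumulation pattern on a toy model follows; the model, data, and hyperparameters are synthetic placeholders, not part of the source.

import torch
from torch import nn

model = nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.BCEWithLogitsLoss()

forward_size, batch_size = 4, 16                   # micro-batch vs. effective batch
accumulate_step = batch_size // forward_size       # plays the role of gradient_accumulate_step

optimizer.zero_grad()
for step in range(1, 101):
    x = torch.randn(forward_size, 16)
    y = torch.randint(0, 2, (forward_size, 1)).float()
    loss = loss_fn(model(x), y) / accumulate_step  # scale so accumulated grads match one large batch
    loss.backward()                                # gradients accumulate across micro-batches
    if step % accumulate_step == 0:
        optimizer.step()
        optimizer.zero_grad()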
class QATrainer(object): def __init__(self): self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") self.model = BertForQuestionAnswering.from_pretrained("bert-base-uncased") train_dir = os.path.join("./save", "qa") self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S"))) if not os.path.exists(self.save_dir): os.makedirs(self.save_dir) # read data-set and prepare iterator self.train_loader = self.get_data_loader("./squad/train-v1.1.json") self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json") num_train_optimization_steps = len(self.train_loader) * config.num_epochs # optimizer param_optimizer = list(self.model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]] no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] self.qa_opt = BertAdam(optimizer_grouped_parameters, lr=config.qa_lr, warmup=config.warmup_proportion, t_total=num_train_optimization_steps) # self.qg_lr = config.lr # assign model to device self.model = self.model.to(config.device) def get_data_loader(self, file): train_examples = read_squad_examples(file, is_training=True, debug=config.debug, reduce_size=config.reduce_size) train_features = convert_examples_to_features(train_examples, tokenizer=self.tokenizer, max_seq_length=config.max_seq_len, max_query_length=config.max_query_len, doc_stride=128, is_training=True) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) sampler = RandomSampler(train_data) batch_size = int(config.batch_size / config.gradient_accumulation_steps) train_loader = DataLoader(train_data, sampler=sampler, batch_size=batch_size) return train_loader def save_model(self, loss, epoch): loss = round(loss, 3) dir_name = os.path.join(self.save_dir, "bert_{}_{:.3f}".format(epoch, loss)) if not os.path.exists(dir_name): os.makedirs(dir_name) # save bert model model_to_save = self.model.module if hasattr(self.model, "module") else self.model model_file = os.path.join(dir_name, "pytorch_model.bin") config_file = os.path.join(dir_name, "bert_config.json") state_dict = model_to_save.state_dict() torch.save(state_dict, model_file) model_to_save.config.to_json_file(config_file) def train(self): global_step = 1 batch_num = len(self.train_loader) best_loss = 1e10 qa_loss_lst = [] self.model.train() for epoch in range(1, 4): start = time.time() for step, batch in enumerate(self.train_loader, start=1): input_ids, input_mask, segment_ids, start_positions, end_positions = batch seq_len = torch.sum(torch.sign(input_ids), 1) max_len = torch.max(seq_len) input_ids = input_ids[:, :max_len].to(config.device) input_mask = input_mask[:, : max_len].to(config.device) segment_ids = segment_ids[:, 
:max_len].to(config.device) start_positions = start_positions.to(config.device) end_positions = end_positions.to(config.device) loss = self.model(input_ids, segment_ids, input_mask, start_positions, end_positions) # scale the loss for gradient accumulation loss /= config.gradient_accumulation_steps loss.backward() qa_loss_lst.append(loss.item()) # update params if step % config.gradient_accumulation_steps == 0: self.qa_opt.step() # zero grad self.qa_opt.zero_grad() global_step += 1 avg_qa_loss = sum(qa_loss_lst) # empty list qa_loss_lst = [] msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}" \ .format(step, batch_num, progress_bar(step, batch_num), eta(start, step, batch_num), avg_qa_loss) print(msg, end="\r") val_loss = self.evaluate(msg) if val_loss <= best_loss: best_loss = val_loss self.save_model(val_loss, epoch) print("Epoch {} took {} - final loss : {:.4f} - val_loss :{:.4f}" .format(epoch, user_friendly_time(time_since(start)), loss, val_loss)) def evaluate(self, msg): self.model.eval() num_val_batches = len(self.dev_loader) val_losses = [] for i, val_data in enumerate(self.dev_loader, start=1): with torch.no_grad(): val_data = tuple(t.to(config.device) for t in val_data) input_ids, input_mask, segment_ids, start_positions, end_positions = val_data val_batch_loss = self.model(input_ids, segment_ids, input_mask, start_positions, end_positions) qa_loss = val_batch_loss val_losses.append(qa_loss.mean().item()) msg2 = "{} => Evaluating :{}/{}".format(msg, i, num_val_batches) print(msg2, end="\r") val_loss = np.mean(val_losses) self.model.train() return val_loss
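`QATrainer.train` trims each batch to its longest non-padded sequence before moving it to the GPU, using `torch.sign` to count non-pad tokens. A small sketch of that trick, assuming pad id 0 as in the loop above; the helper name is illustrative.

import torch

def trim_to_batch_max_len(input_ids: torch.Tensor, *others: torch.Tensor):
    # torch.sign(input_ids) is 1 for real tokens and 0 for padding,
    # so summing it per row gives the true sequence lengths.
    seq_len = torch.sum(torch.sign(input_ids), dim=1)
    max_len = int(torch.max(seq_len).item())
    return tuple(t[:, :max_len] for t in (input_ids, *others))

ids = torch.tensor([[101, 7592, 102, 0, 0],
                    [101, 102,   0, 0, 0]])
mask = (ids > 0).long()
ids, mask = trim_to_batch_max_len(ids, mask)
print(ids.shape)  # torch.Size([2, 3])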
def train(self): device = torch.device("cuda:0") if self.debug_mode: self.epochs = 2 print('loading dataloader') # train_loader, valid_loader = self.create_dataloader() train_dataloader, eval_dataloader, train_examples_length, valid_examples_length, eval_features = self.create_dataloader() print('start training') num_train_optimization_steps = None if do_train: num_train_optimization_steps = int( train_examples_length / self.batch_size / self.gradient_accumulation_steps) * self.epochs model = BertForSequenceClassification.from_pretrained(self.bert_model_path, cache_dir=None, num_labels=self.num_labels).cuda() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = BertAdam(optimizer_grouped_parameters, lr=self.learning_rate, warmup=self.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 tr_loss = 0 best_F1 = 0 tokenizer = BertTokenizer.from_pretrained(self.bert_model_path, cache_dir=None, do_lower_case=True) model.train() for epoch in range(int(self.epochs)): nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): print('epoch:', epoch, 'batchIndex:', step) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids.cuda(), segment_ids.cuda(), input_mask.cuda(), labels=None).cuda() loss_fct = BCEWithLogitsLoss() label_ids = label_ids.cuda() loss = loss_fct(logits.view(-1, 1), label_ids.view(-1, 1)) if self.gradient_accumulation_steps > 1: loss = loss / self.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() model.zero_grad() global_step += 1 if (step + 1) % self.period == 0: model_to_save = model.module if hasattr(model, 'module') else model model.eval() torch.set_grad_enabled(False) # start validation idx = 0 TP, TN, FN, FP = 0, 0, 0, 0 output = {} for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) batch_size = input_ids.size(0) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) logits = torch.sigmoid(logits) preds = (logits > 0.4).float() preds_numpy = preds.cpu().long().data.numpy() for i in range(idx, idx + batch_size): if eval_features[i].file not in output: output[eval_features[i].file] = {} output[eval_features[i].file][eval_features[i].turn] = preds_numpy[i - idx].tolist() TP, TN, FN, FP = obtain_TP_TN_FN_FP(preds, label_ids, TP, TN, FN, FP) idx += batch_size with open("data/BERT_{}_prediction.json".format(self.test_set), 'w') as f: json.dump(output, f) precision = TP / (TP + FP + 0.001) recall = TP / (TP + FN + 0.001) F1 = 2 * precision * recall / (precision + recall + 0.001) logger.info( "epoch is {} step is {} precision is {} recall is {} F1 is {} best_F1 is {}".format(epoch, step, precision, recall, F1, best_F1)) # F1 = evaluate(args, model, device, processor, label_list, num_labels, tokenizer, output_mode) if F1 > best_F1: output_dir = os.path.join("checkpoints/predictor/", 'save_step_{}'.format(global_step)) if
not os.path.exists(output_dir): os.makedirs(output_dir) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir) best_F1 = F1 model.train() # turn on train mode torch.set_grad_enabled(True) # start gradient tracking tr_loss = 0
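The validation block above accumulates TP/TN/FN/FP counts through the external `obtain_TP_TN_FN_FP` helper and smooths precision, recall, and F1 with an additive 0.001 term. The sketch below shows the same metric computation in a self-contained form; the counting logic is an assumption, since `obtain_TP_TN_FN_FP` itself is not shown.

import torch

def smoothed_prf1(preds: torch.Tensor, labels: torch.Tensor, eps: float = 0.001):
    # Count TP/FP/FN directly, then apply the same additive smoothing as above.
    preds, labels = preds.bool(), labels.bool()
    tp = (preds & labels).sum().item()
    fp = (preds & ~labels).sum().item()
    fn = (~preds & labels).sum().item()
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, f1

probs = torch.tensor([0.9, 0.2, 0.6, 0.1])
labels = torch.tensor([1, 0, 0, 1])
print(smoothed_prf1(probs > 0.4, labels))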
def train(self): if self.debug_mode: self.epochs = 1 # load dataloaders train_loader, valid_loader = self.create_dataloader() # training setup self.seed_everything() lr = 2e-5 accumulation_steps = math.ceil(self.batch_size / self.base_batch_size) # convert the pretrained TF BERT checkpoint to PyTorch if needed if os.path.exists(self.bert_model_path + "pytorch_model.bin") is False: convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch( self.bert_model_path + 'bert_model.ckpt', self.bert_model_path + 'bert_config.json', self.bert_model_path + 'pytorch_model.bin') # load the pretrained model model = BertNeuralNet.from_pretrained(self.bert_model_path, cache_dir=None) model.zero_grad() model = model.to(self.device) # use a different weight_decay for different parameter groups param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] epoch_steps = int(self.train_len / self.base_batch_size / accumulation_steps) num_train_optimization_steps = int(self.epochs * epoch_steps) valid_every = math.floor(epoch_steps / 10) optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps) # decaying learning rate #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # start training for epoch in range(self.epochs): train_start_time = time.time() model.train() optimizer.zero_grad() # load each batch and train for i, batch_data in enumerate(train_loader): x_batch = batch_data[0] y_batch = batch_data[1] target_weight_batch = batch_data[2] aux_weight_batch = batch_data[3] identity_weight_batch = batch_data[4] x_mask = batch_data[5] y_pred = model(x_batch, attention_mask=x_mask, labels=None) target_loss, aux_loss, identity_loss = self.custom_loss(y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch, identity_weight_batch) loss = target_loss + aux_loss + identity_loss with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1) % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() # validation if (i + 1) % valid_every == 0: valid_start_time = time.time() model.eval() y_pred = np.zeros((len(self.train_df) - self.train_len)) for j, valid_batch_data in enumerate(valid_loader): x_batch = valid_batch_data[0] x_mask = valid_batch_data[2] batch_y_pred = self.sigmoid(model(x_batch, attention_mask=x_mask, labels=None).detach().cpu().numpy())[:, 0] y_pred[j * self.base_batch_size: (j + 1) * self.base_batch_size] = batch_y_pred # compute the score auc_score = self.evaluator.get_final_metric(y_pred) print("epoch: %d duration: %d min auc_score: %.4f" % (epoch, int((time.time() - train_start_time) / 60), auc_score)) if not self.debug_mode: state_dict = model.state_dict() stage = int((i + 1) / valid_every) train_duration = int((time.time() - train_start_time) / 60) valid_duration = int((time.time() - valid_start_time) / 60) if epoch == 0 and stage == 1: # model[bert][seed][epoch][stage][model_name][stage_train_duration][valid_duration][score].bin model_name = "model/model_%d_%d_%d_%s_%dmin_%dmin_%.4f.bin" % (self.seed, epoch + 1, stage, self.model_name, train_duration, valid_duration, auc_score) else: # model[bert][seed][epoch][stage][model_name][score].bin model_name = "model/model_%d_%d_%d_%s_%.4f.bin" % (self.seed, epoch + 1, stage, self.model_name, auc_score) torch.save(state_dict,
os.path.join(self.data_dir, model_name)) model.train() # free the training-related inputs and the model del train_loader, valid_loader, model, optimizer, param_optimizer, optimizer_grouped_parameters gc.collect()
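The loop above uses the pre-1.6 apex API (`amp.initialize`, `amp.scale_loss`) for O1 mixed precision. For reference, a roughly equivalent loop with PyTorch's built-in `torch.cuda.amp` is sketched below; the model, data, and hyperparameters are toy placeholders and a CUDA device is assumed.

import torch
from torch import nn

model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scaler = torch.cuda.amp.GradScaler()
accumulation_steps = 2

optimizer.zero_grad()
for i in range(100):
    x = torch.randn(8, 16, device='cuda')
    y = torch.randint(0, 2, (8, 1), device='cuda').float()
    with torch.cuda.amp.autocast():                        # mixed-precision forward pass
        loss = nn.functional.binary_cross_entropy_with_logits(model(x), y)
    scaler.scale(loss / accumulation_steps).backward()     # scaled backward, analogous to amp.scale_loss
    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)                             # unscales gradients, then steps
        scaler.update()
        optimizer.zero_grad()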
opt = model.forward_logits(sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask) opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask) opt_mix = torch.cat([opt, opt_mask], dim=-1) logits = model.additional_fc(opt_mix) loss = loss_fn(logits, data_y) optimizer.zero_grad() optimizer_mask.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1) torch.nn.utils.clip_grad_norm_(model_mask.parameters(), 1) optimizer.step() optimizer_mask.step() model.eval() model_mask.eval() with torch.no_grad(): predicted_all = [] gold_all = [] for batch, batch_mask in test_dataset_mix: sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch sentences_s_mask = batch_mask[0] opt = model.forward_logits(sentences_s, mask_s, sentences_t,
targets[:, :1]) #bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:]) return (bce_loss_1 * loss_weight) #return (bce_loss_1 * loss_weight) + bce_loss_2 tq = tqdm(range(EPOCHS)) for epoch in tq: train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) avg_loss = 0. avg_accuracy = 0. lossf = None tk0 = tqdm(enumerate(train_loader), total=len(train_loader), leave=False) optimizer.zero_grad() # Bug fix - thanks to @chinhuic for i, (x_batch, y_batch) in tk0: # optimizer.zero_grad() y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None) #loss = F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device)) loss = custom_loss(y_pred, y_batch.to(device)) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1 ) % accumulation_steps == 0: # Wait for several backward steps optimizer.step() # Now we can do an optimizer step optimizer.zero_grad()
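The snippet above begins mid-way through a `custom_loss` definition, so only its tail is visible: a BCE-with-logits term scaled by `loss_weight`, with a second term commented out. The sketch below shows a generic per-sample-weighted BCE-with-logits loss in the same spirit; it is not a reconstruction of the truncated function, and all names are illustrative.

import torch
from torch import nn

def weighted_bce_with_logits(logits: torch.Tensor, targets: torch.Tensor, sample_weight: torch.Tensor):
    # Per-example BCE, scaled by a per-example weight and averaged.
    per_example = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    return (per_example * sample_weight).mean()

logits = torch.randn(4, 1)
targets = torch.randint(0, 2, (4, 1)).float()
weights = torch.tensor([[1.0], [2.0], [0.5], [1.0]])
print(weighted_bce_with_logits(logits, targets, weights))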
def train(self): model = self.agent config = self.config work_dir = Path(config['work_dir']) train_iter = 0 save_every_niter = config['save_every_niter'] entropy_reg_weight = config['entropy_reg_weight'] summary_writer = SummaryWriter( os.path.join(config['work_dir'], 'tb_log/train')) max_train_step = config['max_train_step'] save_program_cache_niter = config.get('save_program_cache_niter', 0) freeze_bert_for_niter = config.get('freeze_bert_niter', 0) gradient_accumulation_niter = config.get('gradient_accumulation_niter', 1) use_trainable_sketch_predictor = self.config.get( 'use_trainable_sketch_predictor', False) bert_params = [(p_name, p) for (p_name, p) in model.named_parameters() if 'bert_model' in p_name and p.requires_grad] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] bert_grouped_parameters = [{ 'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] bert_optimizer = BertAdam(bert_grouped_parameters, lr=self.config['bert_learning_rate'], warmup=0.1, t_total=max_train_step) # non bert parameters other_params = [ p for n, p in model.named_parameters() if 'bert_model' not in n and p.requires_grad ] other_optimizer = torch.optim.Adam(other_params, lr=0.001) # eval batch loader self.load_dev_environments() dev_iter = nn_util.loop_iter(self.dev_environments, batch_size=self.config['batch_size'], shuffle=True) cum_loss = cum_examples = 0. t1 = time.time() while train_iter < max_train_step: if 'cuda' in self.devices[0].type: torch.cuda.set_device(self.devices[0]) train_iter += 1 other_optimizer.zero_grad() bert_optimizer.zero_grad() train_samples, samples_info = self.train_queue.get() sample_categories = samples_info['category'] dev_batched_envs = next(dev_iter) # get a batch of dev examples # model inference on dev examples dev_samples = model.decode_examples( dev_batched_envs, beam_size=self.config['beam_size']) dev_samples = dev_samples[0] # list of list to list try: queue_size = self.train_queue.qsize() # queue_sizes = [] # for cat in self.categories: # queue_sizes.append(self.queues[cat].qsize()) print( f'[Learner] train_iter={train_iter} train queue size={queue_size}', file=sys.stderr) summary_writer.add_scalar('train_queue_sizes', queue_size, train_iter) except NotImplementedError: pass train_trajectories = [ sample.trajectory for sample in train_samples ] # dev dev_trajectories = [sample.trajectory for sample in dev_samples] # repeat for getting dev grad dev_loss, dev_log_prob = self.forward_single(dev_samples, train_iter, summary_writer, batch_type='dev') # other_optimizer.step() # should we not do this grad_dev_nested = [p.grad for p in other_params] grad_dev = [torch.flatten(g) for g in grad_dev_nested] grad_dev = torch.cat(grad_dev) # print('dev gradient: ', len(grad_dev), grad_dev[0]) # print('log pr dev: ', dev_log_prob) other_optimizer.zero_grad() bert_optimizer.zero_grad() # to save memory, for vertical tableBERT, we partition the training trajectories into small chunks # if isinstance(self.agent.encoder.bert_model, VerticalAttentionTableBert) and 'large' in self.agent.encoder.bert_model.config.base_model_name: # chunk_size = 5 # # dev_chunk_size = 5 # else: # chunk_size = len(train_samples) # dev_chunk_size = len(dev_samples) chunk_size = 1000000000 chunk_num = int(math.ceil(len(train_samples) / chunk_size)) cum_loss = 0. 
log_pr_catwise_train = torch.zeros((len(self.categories), 1)) if chunk_num > 1: for chunk_id in range(0, chunk_num): train_samples_chunk = train_samples[chunk_size * chunk_id:chunk_size * chunk_id + chunk_size] sample_categories_chunk = sample_categories[ chunk_size * chunk_id:chunk_size * chunk_id + chunk_size] for idx, cat in enumerate(self.categories): cat_indices = [ j for j in range(len(train_samples_chunk)) if sample_categories_chunk[j] == cat ] train_cat_chunk = [ train_samples_chunk[j] for j in cat_indices ] loss_val, log_pr_chunk = self.forward_single( train_cat_chunk, train_iter, summary_writer, batch_type='train') cum_loss += loss_val grad_cat = [p.grad for p in other_params] reward = torch.dot(torch.tensor(grad_dev), torch.tensor(grad_cat)) self.current_psi[idx] = self.current_psi[ idx] + self.config['dds_lr'] * reward * log_pr_chunk grad_multiply_factor = 1 / len(train_samples) for p in self.agent.parameters(): if p.grad is not None: p.grad.data.mul_(grad_multiply_factor) else: for idx, cat in enumerate(self.categories): cat_indices = [ j for j in range(len(train_samples)) if sample_categories[j] == cat ] train_cat = [train_samples[j] for j in cat_indices] if not train_cat: # empty list, no samples from this category print('no samples in current batch for: ', cat) sys.stdout.flush() continue loss_val, log_pr = self.forward_single(train_cat, train_iter, summary_writer, batch_type='train') cum_loss = loss_val * len(train_samples) grad_cat = [p.grad for p in other_params] # ignore bert_params grad_cat = [torch.flatten(g) for g in grad_cat] grad_cat = torch.cat(grad_cat) other_optimizer.step() other_optimizer.zero_grad() # for every cat, fresh gradients # print(type(grad_cat), grad_cat, grad_cat.shape) # print(type(grad_dev), grad_dev, grad_dev.shape) sys.stdout.flush() # t1 = torch.FloatTensor(grad_dev) # t2 = torch.FloatTensor(grad_cat) # print(t1.shape) # sys.stdout.flush() # print(t2.shape) # sys.stdout.flush() reward = torch.dot(grad_dev, grad_cat) / ( torch.norm(grad_cat) * torch.norm(grad_dev)) print('reward: ', reward) sys.stderr.flush() sys.stdout.flush() self.current_psi[idx] = self.current_psi[ idx] + self.config['dds_lr'] * reward * log_pr # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(other_params, 5.) # cumulative gradient backprop if train_iter % gradient_accumulation_niter == 0: # other_optimizer.step() if train_iter > freeze_bert_for_niter: bert_optimizer.step() elif train_iter == freeze_bert_for_niter: print( f'[Learner] train_iter={train_iter} reset Adam optimizer and start fine-tuning BERT' ) other_optimizer = torch.optim.Adam(other_params, lr=0.001) self.psi_queue.put(self.current_psi) if 'clip_frac' in samples_info: summary_writer.add_scalar('sample_clip_frac', samples_info['clip_frac'], train_iter) # update sketch predictor if use_trainable_sketch_predictor: if 'cuda' in self.devices[1].type: torch.cuda.set_device(self.devices[1]) self.sketch_predictor_trainer.step(train_trajectories, train_iter=train_iter) cum_examples += len(train_samples) self.try_update_model_to_actors(train_iter) if train_iter % save_every_niter == 0: print( f'[Learner] train_iter={train_iter} avg. loss={cum_loss / cum_examples}, ' f'{cum_examples} examples ({cum_examples / (time.time() - t1)} examples/s)', file=sys.stderr) cum_loss = cum_examples = 0. 
t1 = time.time() # log stats of the program cache program_cache_stat = self.shared_program_cache.stat() summary_writer.add_scalar( 'avg_num_programs_in_cache', program_cache_stat['num_entries'] / program_cache_stat['num_envs'], train_iter) summary_writer.add_scalar('num_programs_in_cache', program_cache_stat['num_entries'], train_iter) if save_program_cache_niter > 0 and train_iter % save_program_cache_niter == 0: program_cache_file = work_dir / 'log' / f'program_cache.iter{train_iter}.json' program_cache = self.shared_program_cache.all_programs() json.dump(program_cache, program_cache_file.open('w'), indent=2)
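The learner above estimates a reward for each data category as the similarity between the flattened dev-set gradient and that category's training gradient, then nudges the sampling parameters `psi` by `dds_lr * reward * log_pr`. A condensed sketch of just that update follows, with hypothetical helper names and under the assumption that gradients have already been populated by `backward()`.

import torch

def flat_grads(params):
    # Concatenate the already-computed gradients of a parameter list into one vector.
    return torch.cat([p.grad.detach().flatten() for p in params if p.grad is not None])

def dds_psi_update(psi, cat_idx, dev_grad, cat_params, log_pr, dds_lr):
    # Reward a category when its training gradient points in the same direction
    # as the dev gradient (cosine similarity), then nudge its sampling weight.
    cat_grad = flat_grads(cat_params)
    reward = torch.dot(dev_grad, cat_grad) / (torch.norm(dev_grad) * torch.norm(cat_grad) + 1e-12)
    psi[cat_idx] = psi[cat_idx] + dds_lr * reward * log_pr
    return psi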