def __call__(
    self,
    sweep_overrides: List[str],
    job_dir_key: str,
    job_num: int,
    job_id: str,
    singleton_state: Dict[type, Singleton],
) -> JobReturn:
    # lazy import to ensure plugin discovery remains fast
    import submitit

    assert self.config_loader is not None
    assert self.config is not None
    assert self.task_function is not None

    Singleton.set_state(singleton_state)
    setup_globals()
    sweep_config = self.config_loader.load_sweep_config(
        self.config, sweep_overrides
    )

    with open_dict(sweep_config.hydra.job) as job:
        # Populate new job variables
        job.id = submitit.JobEnvironment().job_id  # type: ignore
        sweep_config.hydra.job.num = job_num

    return run_job(
        config=sweep_config,
        task_function=self.task_function,
        job_dir_key=job_dir_key,
        job_subdir_key="hydra.sweep.subdir",
    )
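For context, a callable like this is what the launcher pickles and ships to the cluster; it reaches the worker through a submitit executor. Below is a minimal, hypothetical sketch of that submission path using submitit's public API. The folder path, Slurm parameters, and `some_picklable_callable` are illustrative assumptions, not the plugin's actual configuration.

# Sketch only: folder, partition and the submitted callable are placeholders.
import submitit

executor = submitit.AutoExecutor(folder="submitit_logs/%j")  # %j is replaced by the job id
executor.update_parameters(timeout_min=60, slurm_partition="your_partition", tasks_per_node=1)
job = executor.submit(some_picklable_callable)  # e.g. an instance of the class above
print(job.job_id)
result = job.result()  # waits for completion and re-raises remote exceptions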
def log(self, log_data: dict):
    job_env = submitit.JobEnvironment()
    # z = {**vars(self._train_cfg), **log_data}
    save_dir = Path(self._train_cfg.output_dir)
    os.makedirs(save_dir, exist_ok=True)
    with open(save_dir / 'log.txt', 'a') as f:
        f.write(json.dumps(log_data) + '\n')
def _setup_gpu_args(self):
    import submitit

    job_env = submitit.JobEnvironment()
    print(self.args)
    self.args.machine_rank = job_env.global_rank
    print(f"Process rank: {job_env.global_rank}")
def _setup_gpu_args(self):
    import submitit
    import os

    job_env = submitit.JobEnvironment()
    if os.path.basename(self.args.output_dir) != str(job_env.job_id):
        self.args.output_dir = os.path.join(self.args.output_dir, str(job_env.job_id))
def __call__(self):
    job_env = submitit.JobEnvironment()
    os.environ["MASTER_ADDR"] = job_env.hostnames[0]
    os.environ["MASTER_PORT"] = str(self.port)
    os.environ["RANK"] = str(job_env.global_rank)
    os.environ["LOCAL_RANK"] = str(job_env.local_rank)
    os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
    setup_distributed(self.cfg_state)
    self.fun()
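The body of `setup_distributed` is not shown here. A minimal sketch of what such a helper typically does once MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are exported is given below; the function name and the NCCL backend choice are assumptions.

# Hypothetical helper: initializes torch.distributed from the environment variables set above.
import os
import torch

def setup_distributed_from_env():
    # LOCAL_RANK is not read by init_process_group itself; it selects the GPU for this task.
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    torch.distributed.init_process_group(
        backend="nccl",        # assumption: NCCL for GPU training
        init_method="env://",  # reads MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE
    )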
def setup_environ(self, args):
    self.job = False
    try:
        import submitit
        self.job_env = submitit.JobEnvironment()
        args.logdir = args.logdir.replace('%j', str(self.job_env.job_id))
        self.job = True
    except Exception:
        # Not running under submitit (e.g. a local run): leave the logdir untouched.
        self.job_env = None
def __call__(self):
    import submitit

    environment = submitit.JobEnvironment()
    node_id = environment.global_rank
    master_ip = environment.hostnames[0]
    master_port = self.config.SLURM.PORT_ID
    self.config.DISTRIBUTED.INIT_METHOD = "tcp"
    self.config.DISTRIBUTED.RUN_ID = f"{master_ip}:{master_port}"
    extract_features_and_run_knn(node_id=node_id, config=self.config)
def _setup_gpu_args(self):
    import submitit
    from pathlib import Path

    job_env = submitit.JobEnvironment()
    self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
    self.args.gpu = job_env.local_rank
    self.args.rank = job_env.global_rank
    self.args.world_size = job_env.num_tasks
    print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
def update_logdir(opt):
    try:
        import submitit
        job_env = submitit.JobEnvironment()
        opt.logdir = opt.logdir.replace('%j', str(job_env.job_id))
    except Exception:
        print('No job id found')
        opt.logdir = 'runs/test/'
    if not os.path.exists(opt.logdir):
        # makedirs so the nested default path 'runs/test/' can be created in one call
        os.makedirs(opt.logdir)
def checkpoint(self):
    import submitit

    job_env = submitit.JobEnvironment()
    slurm_job_id = job_env.job_id
    if self.args.resume_job == "":
        self.args.resume_job = slurm_job_id
    print("Requeuing ", self.args)
    empty_trainer = type(self)(self.args)
    return submitit.helpers.DelayedSubmission(empty_trainer)
def _setup_process_group(self) -> None:
    job_env = submitit.JobEnvironment()
    torch.cuda.set_device(job_env.local_rank)
    torch.distributed.init_process_group(
        backend=self._cluster_cfg.dist_backend,
        init_method=self._cluster_cfg.dist_url,
        world_size=job_env.num_tasks,
        rank=job_env.global_rank,
    )
    print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
def _setup_gpu_args(self):
    import submitit
    import os

    job_env = submitit.JobEnvironment()
    self.args.gpu = job_env.local_rank
    self.args.rank = job_env.global_rank
    self.args.world_size = job_env.num_tasks
    if os.path.basename(self.args.output_dir) != str(job_env.job_id):
        self.args.output_dir = os.path.join(self.args.output_dir, str(job_env.job_id))
    print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
def checkpoint(self, rm_init=True) -> submitit.helpers.DelayedSubmission:
    # will be called by submitit in case of preemption
    job_env = submitit.JobEnvironment()
    save_dir = osp.join(self._train_cfg.output_dir, str(job_env.job_id))
    os.makedirs(save_dir, exist_ok=True)
    self._state.save(osp.join(save_dir, "checkpoint.pth"))
    # Trick here: when the job is requeued, we will use the same init file,
    # but it must not exist when we initialize the process group,
    # so we delete it, but only when this method is called by submitit for requeueing.
    if rm_init and osp.exists(self._cluster_cfg.dist_url[7:]):
        os.remove(self._cluster_cfg.dist_url[7:])  # strip the leading "file://"
    # This allows us to drop any non-picklable part of the Trainer instance.
    empty_trainer = Trainer(self._train_cfg, self._cluster_cfg)
    return submitit.helpers.DelayedSubmission(empty_trainer)
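For reference, submitit calls a submitted callable's `checkpoint()` method when the job is preempted or times out, and requeues the returned `DelayedSubmission`. A minimal, hypothetical submission sketch around a trainer like the one above follows; the partition, timeout and GPU counts are placeholders, not values from the source.

# Sketch only: cluster parameters below are placeholders.
import submitit

trainer = Trainer(train_cfg, cluster_cfg)  # picklable callable exposing __call__ and checkpoint
executor = submitit.AutoExecutor(folder="logs/%j")
executor.update_parameters(timeout_min=240, slurm_partition="your_partition",
                           nodes=1, tasks_per_node=1, gpus_per_node=1)
job = executor.submit(trainer)
# On preemption or timeout, submitit invokes trainer.checkpoint() and requeues the
# returned DelayedSubmission, so training resumes from the saved "checkpoint.pth".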
def __call__(self):
    import submitit

    environment = submitit.JobEnvironment()
    node_id = environment.global_rank
    master_ip = environment.hostnames[0]
    master_port = self.config.SLURM.PORT_ID
    self.config.DISTRIBUTED.INIT_METHOD = "tcp"
    self.config.DISTRIBUTED.RUN_ID = f"{master_ip}:{master_port}"
    launch_distributed(
        cfg=self.config,
        node_id=node_id,
        engine_name=self.engine_name,
        hook_generator=default_hook_generator,
    )
def _eval(self) -> float:
    print("Start evaluation of the model", flush=True)
    job_env = submitit.JobEnvironment()

    args = self._train_cfg
    eval_dataloader = self._test_loader

    num_correct = 0
    num_total = 0.0
    rrs = []  # reciprocal rank
    self._state.model.eval()
    for batch in self._test_loader:
        batch_to_feed = move_to_cuda(batch)
        with torch.no_grad():
            outputs = self._state.model(batch_to_feed)
            q = outputs['q']
            c = outputs['c']
            neg_c = outputs['neg_c']
            product_in_batch = torch.mm(q, c.t())
            product_neg = (q * neg_c).sum(-1).unsqueeze(1)
            product = torch.cat([product_in_batch, product_neg], dim=-1)
            target = torch.arange(product.size(0)).to(product.device)
            ranked = product.argsort(dim=1, descending=True)
            # MRR
            idx2rank = ranked.argsort(dim=1)
            for idx, t in enumerate(target.tolist()):
                rrs.append(1 / (idx2rank[idx][t].item() + 1))
            prediction = product.argmax(-1)
            pred_res = prediction == target
            num_total += pred_res.size(0)
            num_correct += pred_res.sum(0)

    acc = num_correct / num_total
    mrr = np.mean(rrs)
    print(f"evaluated {num_total} examples...", flush=True)
    print(f"avg. Acc: {acc}", flush=True)
    print(f'MRR: {mrr}', flush=True)
    self._state.model.train()
    return mrr
def run(cfg):
    if cfg.num_gpus > 1:
        job_env = submitit.JobEnvironment()
        rank = job_env.global_rank
        world_size = job_env.num_tasks
        if rank != 0:
            logging.root.handlers = []
        try:
            torch.cuda.set_device(rank)
            # NOTE: 'localhost' as rendezvous address only works when all tasks
            # run on a single node.
            torch.distributed.init_process_group(
                backend='nccl',
                init_method="tcp://{}:{}".format('localhost', 10001),
                world_size=world_size,
                rank=rank)
            train(cfg, is_leader=(rank == 0))
        except KeyboardInterrupt:
            pass
        finally:
            torch.distributed.destroy_process_group()
    else:
        train(cfg, is_leader=True)
def _eval(self) -> dict:
    print("Start evaluation of the model", flush=True)
    job_env = submitit.JobEnvironment()

    args = self._train_cfg
    eval_dataloader = self._test_loader

    self._state.model.eval()
    rrs_1, rrs_2 = [], []  # reciprocal ranks
    for batch in tqdm(eval_dataloader):
        batch_to_feed = move_to_cuda(batch)
        with torch.no_grad():
            outputs = self._state.model(batch_to_feed)
            eval_results = mhop_eval(outputs, args)
            _rrs_1, _rrs_2 = eval_results["rrs_1"], eval_results["rrs_2"]
            rrs_1 += _rrs_1
            rrs_2 += _rrs_2

    mrr_1 = np.mean(rrs_1)
    mrr_2 = np.mean(rrs_2)
    print(f"evaluated {len(rrs_1)} examples...")
    print(f'MRR-1: {mrr_1}')
    print(f'MRR-2: {mrr_2}')
    self._state.model.train()
    return {"mrr_1": mrr_1, "mrr_2": mrr_2, "mrr_avg": (mrr_1 + mrr_2) / 2}
def distributed_setup(self):
    if self.cluster_params.get("use_ethernet", False):
        printc("Forcing ethernet communication", color="CYAN")
        os.environ["NCCL_SOCKET_IFNAME"] = get_tcp_interface_name(
            network_interface_type="ethernet")
        os.environ["NCCL_IB_DISABLE"] = "1"

    job_env = submitit.JobEnvironment()
    master_node = job_env.hostnames[0]
    attrs = ["global_rank", "local_rank", "num_nodes", "num_tasks", "node"]
    self.distributed = {k: getattr(job_env, k) for k in attrs}
    self.distributed["master"] = master_node

    # Init torch.distributed WORLD group
    printc(f"Running with job_id: {job_env.job_id}", color="CYAN")
    port = 42000 + (deterministic_hash(job_env.job_id) % 10000)
    addr = f"tcp://{master_node}:{port}"
    printc(f"Initializing dist group at {addr}", color="CYAN")
    dist.init_process_group(
        init_method=addr,
        rank=job_env.global_rank,
        world_size=job_env.num_tasks,
        backend="nccl",
    )
def _init_state(self) -> None:
    """
    Initialize the state and load it from an existing checkpoint if any
    """
    job_env = submitit.JobEnvironment()

    if job_env.global_rank == 0:
        # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
        os.makedirs(self._train_cfg.output_dir, exist_ok=True)
        config_path = Path(self._train_cfg.output_dir) / 'config.json'
        with open(config_path, "w") as g:
            g.write(json.dumps(self._train_cfg._asdict()))

    print(f"Setting random seed {self._train_cfg.seed}", flush=True)
    random.seed(self._train_cfg.seed)
    np.random.seed(self._train_cfg.seed)
    torch.manual_seed(self._train_cfg.seed)
    torch.cuda.manual_seed_all(self._train_cfg.seed)

    print("Create data loaders", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(self._train_cfg.model_name)
    collate_fc = partial(mhop_collate, pad_id=tokenizer.pad_token_id)
    train_set = MhopDataset(tokenizer, self._train_cfg.train_file, self._train_cfg.max_q_len,
                            self._train_cfg.max_q_sp_len, self._train_cfg.max_c_len, train=True)
    self._train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=self._train_cfg.train_batch_size,
        num_workers=self._train_cfg.num_workers,
        collate_fn=collate_fc,
        shuffle=True)
    test_set = MhopDataset(tokenizer, self._train_cfg.predict_file, self._train_cfg.max_q_len,
                           self._train_cfg.max_q_sp_len, self._train_cfg.max_c_len)
    self._test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=self._train_cfg.predict_batch_size,
        num_workers=self._train_cfg.num_workers,
        collate_fn=collate_fc,
        pin_memory=True)

    print("Create model", flush=True)
    print(f"Local rank {job_env.local_rank}", flush=True)
    bert_config = AutoConfig.from_pretrained(self._train_cfg.model_name)
    if "roberta" in self._train_cfg.model_name:
        model = RobertaRetriever(bert_config, self._train_cfg)
    else:
        model = MhopRetriever(bert_config, self._train_cfg)
    model.cuda(job_env.local_rank)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(
            nd in n for nd in no_decay)], 'weight_decay': self._train_cfg.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = Adam(optimizer_parameters,
                     lr=self._train_cfg.learning_rate,
                     eps=self._train_cfg.adam_epsilon)
    if self._train_cfg.fp16:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=self._train_cfg.fp16_opt_level)

    t_total = len(self._train_loader) // self._train_cfg.gradient_accumulation_steps * self._train_cfg.num_train_epochs
    warmup_steps = t_total * self._train_cfg.warmup_ratio
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    model = torch.nn.DataParallel(model)
    self._state = TrainerState(
        epoch=0, model=model, optimizer=optimizer,
        lr_scheduler=lr_scheduler, global_step=0
    )
    self.tb_logger = SummaryWriter(self._train_cfg.output_dir.replace("logs", "tflogs"))

    checkpoint_fn = osp.join(self._train_cfg.output_dir, str(job_env.job_id), "checkpoint.pth")
    # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
    if os.path.isfile(checkpoint_fn):
        print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
        self._state = TrainerState.load(
            checkpoint_fn, default=self._state, gpu=job_env.local_rank)
def _train(self) -> Optional[float]:
    job_env = submitit.JobEnvironment()
    batch_step = 0  # forward batch count
    best_metric = 0
    train_loss_meter = AverageMeter()
    print("Start training", flush=True)
    # Start from the loaded epoch
    start_epoch = self._state.epoch
    global_step = self._state.global_step
    for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
        print(f"Start epoch {epoch}", flush=True)
        self._state.model.train()
        self._state.epoch = epoch
        for batch in self._train_loader:
            batch_step += 1
            batch_inputs = move_to_cuda(batch["net_inputs"])
            loss = self._state.model(batch_inputs)
            if torch.cuda.device_count() > 1:
                loss = loss.mean()
            if self._train_cfg.gradient_accumulation_steps > 1:
                loss = loss / self._train_cfg.gradient_accumulation_steps
            if self._train_cfg.fp16:
                with amp.scale_loss(loss, self._state.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss_meter.update(loss.item())

            if (batch_step + 1) % self._train_cfg.gradient_accumulation_steps == 0:
                if self._train_cfg.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self._state.optimizer), self._train_cfg.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self._state.model.parameters(), self._train_cfg.max_grad_norm)
                self._state.optimizer.step()
                self._state.lr_scheduler.step()
                self._state.model.zero_grad()
                global_step += 1
                self._state.global_step = global_step

                self.tb_logger.add_scalar('batch_train_loss', loss.item(), global_step)
                self.tb_logger.add_scalar('smoothed_train_loss', train_loss_meter.avg, global_step)

                if job_env.global_rank == 0:
                    if self._train_cfg.eval_period != -1 and global_step % self._train_cfg.eval_period == 0:
                        metrics = self._eval()
                        for k, v in metrics.items():
                            self.tb_logger.add_scalar(k, v * 100, global_step)
                        score = metrics[self._train_cfg.final_metric]
                        if best_metric < score:
                            print("Saving model with best %s %.2f -> em %.2f" % (
                                self._train_cfg.final_metric, best_metric * 100, score * 100), flush=True)
                            torch.save(self._state.model.state_dict(),
                                       os.path.join(self._train_cfg.output_dir, "checkpoint_best.pt"))
                            best_metric = score

        # Checkpoint only on the master
        if job_env.global_rank == 0:
            self.checkpoint(rm_init=False)
            metrics = self._eval()
            for k, v in metrics.items():
                self.tb_logger.add_scalar(k, v * 100, global_step)
            score = metrics[self._train_cfg.final_metric]
            if best_metric < score:
                print("Saving model with best %s %.2f -> em %.2f" % (
                    self._train_cfg.final_metric, best_metric * 100, score * 100), flush=True)
                torch.save(self._state.model.state_dict(),
                           os.path.join(self._train_cfg.output_dir, "checkpoint_best.pt"))
                best_metric = score
            self.log({
                "best_score": best_metric,
                "curr_score": score,
                "smoothed_loss": train_loss_meter.avg,
                "epoch": epoch
            })
    return best_metric
def my_app(cfg: DictConfig):
    env = submitit.JobEnvironment()
    log.info(f"Process ID {os.getpid()} executing task {cfg.task}, with {env}")
    time.sleep(1)
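A function like my_app is typically decorated with @hydra.main and launched through the Hydra submitit launcher plugin; the following commented sketch shows that wiring. The sweep values and config name are illustrative assumptions.

# Sketch: assumes the function above is decorated with @hydra.main and that the
# hydra-submitit-launcher plugin is installed; the overrides below are illustrative.
#
#   pip install hydra-submitit-launcher
#   python my_app.py --multirun task=1,2,3 hydra/launcher=submitit_slurm
#
# Each sweep value then runs as its own submitit/Slurm job, so submitit.JobEnvironment()
# inside the task reports that job's id.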
def _init_state(self) -> None:
    """
    Initialize the state and load it from an existing checkpoint if any
    """
    job_env = submitit.JobEnvironment()

    if job_env.global_rank == 0:
        # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
        os.makedirs(self._train_cfg.output_dir, exist_ok=True)
        config_path = Path(self._train_cfg.output_dir) / 'config.json'
        with open(config_path, "w") as g:
            g.write(json.dumps(self._train_cfg._asdict()))

    print(f"Setting random seed {self._train_cfg.seed}", flush=True)
    random.seed(self._train_cfg.seed)
    np.random.seed(self._train_cfg.seed)
    torch.manual_seed(self._train_cfg.seed)

    print("Create data loaders", flush=True)
    tokenizer = BertTokenizer.from_pretrained(self._train_cfg.bert_model_name)
    collate_fc = sp_collate
    train_set = SPDataset(tokenizer, self._train_cfg.train_file,
                          self._train_cfg.max_q_len, self._train_cfg.max_c_len, train=True)
    # train_sampler = torch.utils.data.distributed.DistributedSampler(
    #     train_set, num_replicas=job_env.num_tasks, rank=job_env.global_rank
    # )
    # self._train_loader = torch.utils.data.DataLoader(
    #     train_set,
    #     batch_size=self._train_cfg.train_batch_size,
    #     num_workers=4,
    #     sampler=train_sampler, collate_fn=collate_fc
    # )
    self._train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=self._train_cfg.train_batch_size,
        num_workers=4,
        collate_fn=collate_fc)
    test_set = SPDataset(tokenizer, self._train_cfg.predict_file,
                         self._train_cfg.max_q_len, self._train_cfg.max_c_len)
    self._test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=self._train_cfg.predict_batch_size,
        num_workers=4,
        collate_fn=collate_fc)
    print(f"Per Node batch_size: {self._train_cfg.train_batch_size // job_env.num_tasks}", flush=True)

    print("Create model", flush=True)
    print(f"Local rank {job_env.local_rank}", flush=True)
    bert_config = BertConfig.from_pretrained(self._train_cfg.bert_model_name)
    model = BertForRetrieverSP(bert_config, self._train_cfg)
    model.cuda(job_env.local_rank)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self._train_cfg.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_parameters, lr=self._train_cfg.learning_rate)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5)
    if self._train_cfg.fp16:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=self._train_cfg.fp16_opt_level)

    model = torch.nn.DataParallel(model)
    # self._state = TrainerState(epoch=0, model=model, optimizer=optimizer,
    #                            lr_scheduler=lr_scheduler, global_step=0)
    self.tb_logger = SummaryWriter(os.path.join(self._train_cfg.output_dir, "tblog"))

    checkpoint_fn = osp.join(self._train_cfg.output_dir, str(job_env.job_id), "checkpoint.pth")
    # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
    if os.path.isfile(checkpoint_fn):
        print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
        self._state = TrainerState.load(checkpoint_fn, default=self._state, gpu=job_env.local_rank)
def _eval(self) -> dict:
    print("Start evaluation of the model", flush=True)
    job_env = submitit.JobEnvironment()

    args = self._train_cfg
    eval_dataloader = self._test_loader
    model = self._state.model
    model.eval()

    id2result = collections.defaultdict(list)
    id2answer = collections.defaultdict(list)
    id2gold = {}
    id2goldsp = {}
    for batch in tqdm(eval_dataloader):
        batch_to_feed = move_to_cuda(batch["net_inputs"])
        batch_qids = batch["qids"]
        batch_labels = batch["net_inputs"]["label"].view(-1).tolist()
        with torch.no_grad():
            outputs = model(batch_to_feed)
            scores = outputs["rank_score"]
            scores = scores.view(-1).tolist()
            sp_scores = outputs["sp_score"]
            sp_scores = sp_scores.float().masked_fill(
                batch_to_feed["sent_offsets"].eq(0), float("-inf")).type_as(sp_scores)
            batch_sp_scores = sp_scores.sigmoid()
            # ans_type_predicted = torch.argmax(outputs["ans_type_logits"], dim=1).view(-1).tolist()
            outs = [outputs["start_logits"], outputs["end_logits"]]

        for qid, label, score in zip(batch_qids, batch_labels, scores):
            id2result[qid].append((label, score))

        # answer prediction
        span_scores = outs[0][:, :, None] + outs[1][:, None]
        max_seq_len = span_scores.size(1)
        span_mask = np.tril(np.triu(np.ones((max_seq_len, max_seq_len)), 0), args.max_ans_len)
        span_mask = span_scores.data.new(max_seq_len, max_seq_len).copy_(torch.from_numpy(span_mask))
        span_scores_masked = span_scores.float().masked_fill(
            (1 - span_mask[None].expand_as(span_scores)).bool(), -1e10).type_as(span_scores)
        start_position = span_scores_masked.max(dim=2)[0].max(dim=1)[1]
        end_position = span_scores_masked.max(dim=2)[1].gather(
            1, start_position.unsqueeze(1)).squeeze(1)
        answer_scores = span_scores_masked.max(dim=2)[0].max(dim=1)[0].tolist()
        para_offset = batch['para_offsets']
        start_position_ = list(np.array(start_position.tolist()) - np.array(para_offset))
        end_position_ = list(np.array(end_position.tolist()) - np.array(para_offset))

        for idx, qid in enumerate(batch_qids):
            id2gold[qid] = batch["gold_answer"][idx]
            id2goldsp[qid] = batch["sp_gold"][idx]
            rank_score = scores[idx]
            sp_score = batch_sp_scores[idx].tolist()
            start = start_position_[idx]
            end = end_position_[idx]
            span_score = answer_scores[idx]

            tok_to_orig_index = batch['tok_to_orig_index'][idx]
            doc_tokens = batch['doc_tokens'][idx]
            wp_tokens = batch['wp_tokens'][idx]
            orig_doc_start = tok_to_orig_index[start]
            orig_doc_end = tok_to_orig_index[end]
            orig_tokens = doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_tokens = wp_tokens[start:end + 1]
            tok_text = " ".join(tok_tokens)
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            pred_str = get_final_text(tok_text, orig_text, do_lower_case=True, verbose_logging=False)

            pred_sp = []
            passages = batch["passages"][idx]
            for passage, sent_offset in zip(passages, [0, len(passages[0]["sents"])]):
                for idx, _ in enumerate(passage["sents"]):
                    try:
                        if sp_score[idx + sent_offset] > 0.5:
                            pred_sp.append([passage["title"], idx])
                    except Exception:
                        continue
            id2answer[qid].append((pred_str.strip(), rank_score, span_score, pred_sp))

    acc = []
    for qid, res in id2result.items():
        res.sort(key=lambda x: x[1], reverse=True)
        acc.append(res[0][0] == 1)
    print(f"evaluated {len(id2result)} questions...", flush=True)
    print(f'chain ranking em: {np.mean(acc)}', flush=True)

    best_em, best_f1, best_joint_em, best_joint_f1 = 0, 0, 0, 0
    lambdas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    for lambda_ in lambdas:
        ems, f1s = [], []
        sp_ems, sp_f1s = [], []
        joint_ems, joint_f1s = [], []
        for qid, res in id2result.items():
            ans_res = id2answer[qid]
            ans_res.sort(key=lambda x: lambda_ * x[1] + (1 - lambda_) * x[2], reverse=True)
            top_pred = ans_res[0][0]
            ems.append(exact_match_score(top_pred, id2gold[qid][0]))
            f1, prec, recall = f1_score(top_pred, id2gold[qid][0])
            f1s.append(f1)
            top_pred_sp = ans_res[0][3]
            metrics = {'sp_em': 0, 'sp_f1': 0, 'sp_prec': 0, 'sp_recall': 0}
            update_sp(metrics, top_pred_sp, id2goldsp[qid])
            sp_ems.append(metrics['sp_em'])
            sp_f1s.append(metrics['sp_f1'])

            # joint metrics
            joint_prec = prec * metrics["sp_prec"]
            joint_recall = recall * metrics["sp_recall"]
            if joint_prec + joint_recall > 0:
                joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
            else:
                joint_f1 = 0
            joint_em = ems[-1] * sp_ems[-1]
            joint_ems.append(joint_em)
            joint_f1s.append(joint_f1)

        if best_joint_f1 < np.mean(joint_f1s):
            best_joint_f1 = np.mean(joint_f1s)
            best_joint_em = np.mean(joint_ems)
            best_f1 = np.mean(f1s)
            best_em = np.mean(ems)

        print(f".......Using combination factor {lambda_}......", flush=True)
        print(f'answer em: {np.mean(ems)}, count: {len(ems)}', flush=True)
        print(f'answer f1: {np.mean(f1s)}, count: {len(f1s)}', flush=True)
        print(f'sp em: {np.mean(sp_ems)}, count: {len(sp_ems)}', flush=True)
        print(f'sp f1: {np.mean(sp_f1s)}, count: {len(sp_f1s)}', flush=True)
        print(f'joint em: {np.mean(joint_ems)}, count: {len(joint_ems)}', flush=True)
        print(f'joint f1: {np.mean(joint_f1s)}, count: {len(joint_f1s)}', flush=True)
    print(f"Best joint EM/F1 from combination {best_em}/{best_f1}", flush=True)

    model.train()
    return {"em": best_em, "f1": best_f1, "joint_em": best_joint_em, "joint_f1": best_joint_f1}
def _train(self) -> Optional[float]:
    job_env = submitit.JobEnvironment()
    loss_fct = CrossEntropyLoss()
    batch_step = 0  # forward batch count
    best_mrr = 0
    train_loss_meter = AverageMeter()
    print("Start training", flush=True)
    # Start from the loaded epoch
    start_epoch = self._state.epoch
    global_step = self._state.global_step
    for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
        print(f"Start epoch {epoch}", flush=True)
        self._state.model.train()
        self._state.epoch = epoch
        for batch in self._train_loader:
            batch_step += 1
            batch = move_to_cuda(batch)
            outputs = self._state.model(batch)
            q = outputs['q']
            c = outputs['c']
            neg_c = outputs['neg_c']
            product_in_batch = torch.mm(q, c.t())
            product_neg = (q * neg_c).sum(-1).unsqueeze(1)
            product = torch.cat([product_in_batch, product_neg], dim=-1)
            target = torch.arange(product.size(0)).to(product.device)
            loss = loss_fct(product, target)

            if self._train_cfg.gradient_accumulation_steps > 1:
                loss = loss / self._train_cfg.gradient_accumulation_steps
            if self._train_cfg.fp16:
                with amp.scale_loss(loss, self._state.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss_meter.update(loss.item())
            self.tb_logger.add_scalar('batch_train_loss', loss.item(), global_step)
            self.tb_logger.add_scalar('smoothed_train_loss', train_loss_meter.avg, global_step)

            if (batch_step + 1) % self._train_cfg.gradient_accumulation_steps == 0:
                if self._train_cfg.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self._state.optimizer), self._train_cfg.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self._state.model.parameters(), self._train_cfg.max_grad_norm)
                self._state.optimizer.step()  # We have accumulated enough gradients
                self._state.model.zero_grad()
                global_step += 1
                self._state.global_step = global_step

        # Checkpoint only on the master
        # if job_env.global_rank == 0:
        self.checkpoint(rm_init=False)
        mrr = self._eval()
        self.tb_logger.add_scalar('dev_mrr', mrr * 100, epoch)
        self._state.lr_scheduler.step(mrr)
        if best_mrr < mrr:
            print("Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                  (best_mrr * 100, mrr * 100, epoch))
            torch.save(self._state.model.state_dict(),
                       os.path.join(self._train_cfg.output_dir, "checkpoint_best.pt"))
            best_mrr = mrr
        self.log({
            "best_mrr": best_mrr,
            "curr_mrr": mrr,
            "smoothed_loss": train_loss_meter.avg,
            "epoch": epoch
        })
    return best_mrr
def main(init_file,
         path_model="test.t7",
         result_file='blabla.txt',
         dataset="CIFAR10",
         num_classes=10,
         batch_size=256,
         attack="PGDL2",
         eot_samples=1,  # 80
         noise=None,
         batch_prediction=1,
         sigma=0.25,
         save_image=False):

    torch.manual_seed(1234)
    job_env = submitit.JobEnvironment()
    print(job_env)
    torch.cuda.set_device(job_env.local_rank)
    torch.distributed.init_process_group(
        backend="nccl",
        init_method=init_file,
        world_size=job_env.num_tasks,
        rank=job_env.global_rank,
    )

    if noise is None:
        batch_prediction = None
        sigma = None

    # Load inputs
    test_loader = load_data(dataset=dataset, datadir="datasets",
                            batch_size_per_gpu=int(batch_size / job_env.num_tasks),
                            job_env=job_env, train_mode=False)
    num_images = len(test_loader.dataset)

    # Classifier definition
    # torch.nn.Module.dump_patches = True
    # model_load = torch.load(path_model)
    # Classifier = model_load["net"]
    ckpt = torch.load(path_model)
    epoch = ckpt["epoch"]
    model, _ = getNetwork(net_type="wide-resnet", depth=28, widen_factor=10,
                          dropout=0.3, num_classes=num_classes)
    model.load_state_dict(ckpt["model_state_dict"])
    Classifier = RandModel(model, noise=noise, sigma=sigma)

    Classifier.cuda(job_env.local_rank)
    cudnn.benchmark = True
    Classifier = torch.nn.parallel.DistributedDataParallel(
        Classifier, device_ids=[job_env.local_rank], output_device=job_env.local_rank)

    print("Classifier initialized")
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
    # print(Classifier)
    Classifier.eval()

    adversaries = dict()
    adversaries["CW"] = attacks.CarliniWagnerL2Attack(
        Classifier, num_classes, learning_rate=0.01, binary_search_steps=9,
        max_iterations=60, abort_early=True, initial_const=0.001,
        clip_min=0.0, clip_max=1.)
    adversaries["EAD"] = attacks.ElasticNetL1Attack(
        Classifier, num_classes, confidence=0, targeted=False, learning_rate=0.01,
        binary_search_steps=9, max_iterations=60, abort_early=True, initial_const=1e-3,
        clip_min=0., clip_max=1., beta=1e-3, decision_rule='EN')
    adversaries["PGDL1"] = attacks.SparseL1PGDAttack(
        Classifier, eps=10., nb_iter=40, eps_iter=2 * 10. / 40, rand_init=False,
        clip_min=0.0, clip_max=1.0, sparsity=0.05, eot_samples=eot_samples)
    adversaries["PGDLinf"] = attacks.LinfPGDAttack(
        Classifier, eps=0.031, nb_iter=40, eps_iter=2 * 0.031 / 40, rand_init=True,
        clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)
    adversaries["PGDL2"] = attacks.L2PGDAttack(
        Classifier, eps=2., nb_iter=40, eps_iter=2 * 2. / 40, rand_init=True,
        clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)
    adversaries["FGSM"] = attacks.GradientSignAttack(
        Classifier, loss_fn=None, eps=0.05, clip_min=0., clip_max=1.,
        targeted=False, eot_samples=eot_samples)

    current_num_input = 0
    running_acc = 0
    if attack is not None:
        norms_l1 = []
        norms_l2 = []
        norms_linf = []
    for i, data in enumerate(test_loader, 0):
        if i > 0 and save_image:
            break
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.cuda(job_env.local_rank), labels.cuda(job_env.local_rank)
        if (i == 0) and save_image and (job_env.global_rank == 0):
            torchvision.utils.save_image(inputs, "images_nat.jpg", nrow=8, padding=2,
                                         normalize=False, range=None, scale_each=False,
                                         pad_value=0)
        if attack is not None:
            inputs_adv = adversaries[attack].perturb(inputs, labels)
            norms_l1_batch = get_lp_norm(inputs_adv - inputs, p=1)
            norms_l2_batch = get_lp_norm(inputs_adv - inputs, p=2)
            norms_linf_batch = get_lp_norm(inputs_adv - inputs, p=np.inf)
            norms_l1.append(norms_l1_batch)
            norms_l2.append(norms_l2_batch)
            norms_linf.append(norms_linf_batch)
            inputs = inputs_adv
        if (i == 0) and save_image and (job_env.global_rank == 0):
            torchvision.utils.save_image(inputs, "images_adv.jpg", nrow=8, padding=2,
                                         normalize=False, range=None, scale_each=False,
                                         pad_value=0)

        with torch.no_grad():
            if noise is None:
                outputs = Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)
            else:
                outputs = torch.FloatTensor(labels.shape[0], num_classes).cuda()
                outputs.zero_()
                for _ in range(batch_prediction):
                    outputs += Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)

        # print statistics
        running_acc += predicted.eq(labels.data).cpu().sum().numpy()
        curr_batch_size = inputs.size(0)
        current_num_input += curr_batch_size
        print("[", (i + 1) * batch_size, "/", num_images, "] running_acc=",
              running_acc / current_num_input)

    running_acc = torch.Tensor([running_acc]).cuda(job_env.local_rank)
    torch.distributed.all_reduce(running_acc, op=torch.distributed.ReduceOp.SUM)
    accuracy = (running_acc / num_images).cpu().sum().numpy()
    print(accuracy)

    if attack is not None:
        norms_l1 = torch.cat(norms_l1).view(-1)
        norms_l2 = torch.cat(norms_l2).view(-1)
        norms_linf = torch.cat(norms_linf).view(-1)
        norms_l1_gathered = all_gather(norms_l1)
        norms_l2_gathered = all_gather(norms_l2)
        norms_linf_gathered = all_gather(norms_linf)
        norms_l1_gathered = torch.cat(norms_l1_gathered).view(-1).detach().cpu().numpy()
        norms_l2_gathered = torch.cat(norms_l2_gathered).view(-1).detach().cpu().numpy()
        norms_linf_gathered = torch.cat(norms_linf_gathered).view(-1).detach().cpu().numpy()

    if job_env.global_rank == 0:
        if attack is not None:
            np.save(result_file + "_" + attack + "_l1norm", norms_l1_gathered)
            np.save(result_file + "_" + attack + "_l2norm", norms_l2_gathered)
            np.save(result_file + "_" + attack + "_linfnorm", norms_linf_gathered)
        with open(result_file + ".txt", 'a') as f:
            f.write('{} {} {} {} {} {} {}\n'.format(epoch, dataset, noise, batch_prediction,
                                                    attack, eot_samples, accuracy))

    torch.distributed.barrier()
    torch.distributed.destroy_process_group()
    print(job_env.local_rank, job_env.global_rank)
    return job_env.local_rank, job_env.global_rank
def _init_state(self) -> None:
    """
    Initialize the state and load it from an existing checkpoint if any
    """
    job_env = submitit.JobEnvironment()

    if job_env.global_rank == 0:
        # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
        os.makedirs(self._train_cfg.output_dir, exist_ok=True)
        config_path = Path(self._train_cfg.output_dir) / 'config.json'
        with open(config_path, "w") as g:
            g.write(json.dumps(self._train_cfg._asdict()))

    print(f"Setting random seed {self._train_cfg.seed}", flush=True)
    random.seed(self._train_cfg.seed)
    np.random.seed(self._train_cfg.seed)
    torch.manual_seed(self._train_cfg.seed)

    print("Create data loaders", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(self._train_cfg.model_name)
    collate_fc = partial(rank_collate, pad_id=tokenizer.pad_token_id)
    train_set = RankingDataset(tokenizer, self._train_cfg.train_file,
                               self._train_cfg.max_seq_len, self._train_cfg.max_q_len, train=True)
    train_sampler = MhopSampler(train_set, num_neg=self._train_cfg.neg_num)
    batch_size_per_gpu = (1 + self._train_cfg.neg_num) * self._train_cfg.num_q_per_gpu
    n_gpu = torch.cuda.device_count()
    print(f"Number of GPUs: {n_gpu}", flush=True)
    print(f"Batch size per node: {batch_size_per_gpu * n_gpu}", flush=True)
    self._train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=batch_size_per_gpu * n_gpu,
        num_workers=self._train_cfg.num_workers,
        collate_fn=collate_fc,
        sampler=train_sampler)
    test_set = RankingDataset(tokenizer, self._train_cfg.predict_file,
                              self._train_cfg.max_seq_len, self._train_cfg.max_q_len)
    self._test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=self._train_cfg.predict_batch_size,
        num_workers=self._train_cfg.num_workers,
        collate_fn=collate_fc)

    print("Create model", flush=True)
    print(f"Local rank {job_env.local_rank}", flush=True)
    bert_config = AutoConfig.from_pretrained(self._train_cfg.model_name)
    model = QAModel(bert_config, self._train_cfg)
    model.cuda(job_env.local_rank)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(
            nd in n for nd in no_decay)], 'weight_decay': self._train_cfg.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if self._train_cfg.use_adam:
        optimizer = optim.Adam(optimizer_parameters, lr=self._train_cfg.learning_rate)
    else:
        optimizer = AdamW(optimizer_parameters, lr=self._train_cfg.learning_rate)
    # lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    if self._train_cfg.fp16:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=self._train_cfg.fp16_opt_level)

    t_total = len(self._train_loader) // self._train_cfg.gradient_accumulation_steps * self._train_cfg.num_train_epochs
    warmup_steps = t_total * self._train_cfg.warmup_ratio
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    model = torch.nn.DataParallel(model)
    self._state = TrainerState(
        epoch=0, model=model, optimizer=optimizer,
        lr_scheduler=lr_scheduler, global_step=0
    )
    self.tb_logger = SummaryWriter(self._train_cfg.output_dir.replace("logs", "tflogs"))

    checkpoint_fn = osp.join(self._train_cfg.output_dir, str(job_env.job_id), "checkpoint.pth")
    # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
    if os.path.isfile(checkpoint_fn):
        print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
        self._state = TrainerState.load(
            checkpoint_fn, default=self._state, gpu=job_env.local_rank)
def __init__(self, opt, dset=None):
    logdir = opt.logdir
    ###
    try:
        import submitit
        job_env = submitit.JobEnvironment()
        logdir = logdir.replace('%j', str(job_env.job_id))
        opt.logdir = logdir
    except Exception:
        print('No job id found')
    ###

    if opt.ngpus > 1:
        opt.bSz = opt.bSz * opt.ngpus
        opt.n_workers = int(min(opt.n_workers * opt.ngpus, 20))

    self.opt = opt
    print(f'Training with opts: {opt}')

    self.writer = SummaryWriter(logdir)
    print(f'Log dir: {self.writer.log_dir}')
    self.writer.add_text('opts', str(opt), 0)

    # Fix seed
    if opt.seed:
        torch.manual_seed(opt.seed)

    # depending on the chosen architecture adapt training image size
    if '224' in opt.feat_arch:
        opt.iSz = 224
        print(f'Using iSz: {opt.iSz}')
    else:
        print(f'Continuing with iSz: {opt.iSz}')

    # construct train dataset or use provided one
    if dset is None:
        self.traindset = get_dataset(opt.dataset, classDset=True, iSz=opt.iSz)
    else:
        self.traindset = dset
    print(self.traindset)
    print(self.traindset.classes[0].samples[0])
    print('Train dataset class length histogram')
    print(np.histogram([len(c) for c in self.traindset.classes]))

    self.ttype = 'IN' if opt.benchmark == 'IN' else 'miniIN'
    self.traindset.transform = get_transform(self.ttype, phase='train',
                                             do_normalize=True, iSz=opt.iSz)
    print('Train transform: ', self.traindset.transform)

    # construct dataloader
    self.init_dataloader(self.traindset)

    # construct validation/test dataset
    self.get_val_test_sets()
    print('val dataset: ', self.valdset)
    print('test dataset: ', self.testdset)

    # verify image size
    assert opt.iSz in [224, 84], f' Got iSz: {opt.iSz}'

    # construct model
    self.model = Model(feat_arch=opt.feat_arch, nClasses=len(self.traindset.classes))
    if opt.ngpus > 1:
        self.model = torch.nn.DataParallel(self.model, device_ids=range(opt.ngpus))
        print(f'Using {opt.ngpus} GPUs')
    self.model.cuda()
    print(self.model)

    if opt.steps is None:
        opt.steps = get_steps(len(self.traindset), bSz=opt.bSz)
        print(f'Using steps: {opt.steps}')
    opt.max_iter = opt.steps[-1]

    # setup optimizer and scheduler
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=opt.lr,
                                     momentum=opt.momentum, weight_decay=opt.wd,
                                     nesterov=opt.nesterov)
    self.scheduler = MultiStepWarmupLR(self.optimizer, milestones=opt.steps,
                                       gamma=opt.gamma, warmup_steps=opt.warmup_steps)

    self.iteration = 0
    self.ims = torch.FloatTensor().cuda()
    self.targets = torch.LongTensor().cuda()
    self.best_5shot = 0
    self.best_ckpt_file = os.path.join(self.writer.log_dir, 'best_checkpoint.pth')
    cudnn.benchmark = True
    print(f'Dataset size: {len(self.traindset)}, bSz: {opt.bSz}, '
          f'steps: {opt.steps}, len dataloader {len(self.trainloader)}')
def main(init_file,
         path_model="model_test/blabla",
         dataset='ImageNet',
         num_classes=1000,
         epochs=200,
         batch_size=64,
         resume_epoch=0,
         save_frequency=2,
         adversarial_training=None,
         attack_list=["PGDLinf", "PGDL2"],
         eot_samples=1,
         noise=None,
         sigma=0.25):

    torch.manual_seed(1234)
    job_env = submitit.JobEnvironment()
    print(job_env)
    torch.cuda.set_device(job_env.local_rank)
    torch.distributed.init_process_group(
        backend="nccl",
        init_method=init_file,
        world_size=job_env.num_tasks,
        rank=job_env.global_rank,
    )
    print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")

    if not os.path.exists(path_model):
        os.makedirs(path_model)

    # Load inputs
    if dataset == "ImageNet":
        train_loader = load_data(dataset=dataset,
                                 datadir="/datasets01_101/imagenet_full_size/061417/",  # to adapt
                                 batch_size_per_gpu=int(batch_size / job_env.num_tasks),
                                 job_env=job_env, train_mode=True)
    else:
        train_loader = load_data(dataset=dataset, datadir="datasets",
                                 batch_size_per_gpu=int(batch_size / job_env.num_tasks),
                                 job_env=job_env, train_mode=True)
    num_images = len(train_loader.dataset)

    # Classifier definition
    if dataset == "ImageNet":
        # Classifier = models.resnet18(pretrained=False)
        Classifier, modelname = getNetwork(net_type='inceptionresnetv2', num_classes=num_classes)
    else:
        Classifier, modelname = getNetwork(net_type="wide-resnet", depth=28, widen_factor=10,
                                           dropout=0.3, num_classes=num_classes)
        Classifier.apply(conv_init)
    Classifier = RandModel(Classifier, noise=noise, sigma=sigma)
    Classifier.cuda(job_env.local_rank)
    cudnn.benchmark = True
    Classifier = torch.nn.parallel.DistributedDataParallel(
        Classifier, device_ids=[job_env.local_rank], output_device=job_env.local_rank)
    Classifier.train()
    print("Classifier initialized")

    # optimizer and criterion
    if adversarial_training == "MixMax":
        criterion = torch.nn.CrossEntropyLoss(reduction="none").cuda(job_env.local_rank)
    else:
        criterion = torch.nn.CrossEntropyLoss().cuda(job_env.local_rank)
    optimizer = torch.optim.SGD(Classifier.parameters(), lr=0.1 * batch_size / 256,
                                momentum=0.9, weight_decay=5e-4)
    if dataset != "ImageNet":
        scheduler = get_scheduler(optimizer, policy="multistep",
                                  milestones=[60, 120, 160], gamma=0.2)
    else:
        scheduler = get_scheduler(optimizer, policy="multistep",
                                  milestones=[30, 60, 90], gamma=0.2)

    # resume learning
    if resume_epoch > 0:
        if os.path.isfile(path_model):
            print("=> loading checkpoint '{}'".format(path_model))
            checkpoint = torch.load(path_model)
            Classifier = checkpoint['net']
            print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(path_model))

    adversaries = dict()
    adversaries["CW"] = attacks.CarliniWagnerL2Attack(
        Classifier, num_classes, learning_rate=0.01, binary_search_steps=9,
        max_iterations=15, abort_early=True, initial_const=0.001, clip_min=0.0, clip_max=1.)
    adversaries["EAD"] = attacks.ElasticNetL1Attack(
        Classifier, num_classes, confidence=0, targeted=False, learning_rate=0.01,
        binary_search_steps=9, max_iterations=60, abort_early=True, initial_const=1e-3,
        clip_min=0., clip_max=1., beta=1e-3, decision_rule='EN')
    adversaries["PGDL1"] = attacks.SparseL1PGDAttack(
        Classifier, eps=10., nb_iter=10, eps_iter=2 * 10. / 10, rand_init=False,
        clip_min=0.0, clip_max=1.0, sparsity=0.05, eot_samples=eot_samples)
    adversaries["PGDLinf"] = attacks.LinfPGDAttack(
        Classifier, eps=0.031, nb_iter=10, eps_iter=2 * 0.031 / 10, rand_init=True,
        clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)
    adversaries["PGDL2"] = attacks.L2PGDAttack(
        Classifier, eps=2., nb_iter=10, eps_iter=2 * 2. / 10, rand_init=True,
        clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)
    adversaries["FGSM"] = attacks.GradientSignAttack(
        Classifier, loss_fn=None, eps=0.05, clip_min=0., clip_max=1.,
        targeted=False, eot_samples=eot_samples)
    # TODO: add L1 attacks

    for epoch in range(epochs):
        current_num_input = 0
        running_loss = 0.0
        running_acc = 0
        start_time_epoch = time.time()
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs, labels = inputs.cuda(job_env.local_rank), labels.cuda(job_env.local_rank)

            if adversarial_training is None:
                outputs = Classifier(inputs)
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            if adversarial_training == "Single":
                inputs_adv = adversaries[attack_list[0]].perturb(inputs, labels)
                outputs = Classifier(inputs_adv)
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            elif adversarial_training == "MixMean":
                loss = 0
                for att in attack_list:
                    inputs_adv = adversaries[att].perturb(inputs, labels)
                    outputs = Classifier(inputs_adv)
                    loss += criterion(outputs, labels)
                loss /= len(attack_list)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            elif adversarial_training == "MixRand":
                att = random.choice(attack_list)
                inputs_adv = adversaries[att].perturb(inputs, labels)
                outputs = Classifier(inputs_adv)
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            elif adversarial_training == "MixMax":
                loss = torch.zeros_like(labels).float()
                for att in attack_list:
                    inputs_adv = adversaries[att].perturb(inputs, labels)
                    outputs = Classifier(inputs_adv)
                    l = criterion(outputs, labels).float()
                    loss = torch.max(loss, l)
                loss = loss.mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            with torch.no_grad():
                outputs = Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)
            running_loss += loss.item()
            running_acc += predicted.eq(labels.data).cpu().sum().numpy()
            curr_batch_size = inputs.size(0)
            if i % 5 == 4:
                print("Epoch :[", epoch + 1, "/", epochs,
                      "] [", i * batch_size, "/", num_images,
                      "] Running loss:", running_loss / 5,
                      ", Running accuracy:", running_acc / (5 * curr_batch_size),
                      " time:", time.time() - start_time_epoch)
                running_loss = 0.0
                running_acc = 0

        # save model
        if ((epoch + 1) % save_frequency == 0) and (job_env.global_rank == 0):
            state = {
                'epoch': epoch + 1,
                'model_state_dict': Classifier.state_dict(),
            }
            torch.save(state, os.path.join(path_model, "epoch_" + str(epoch + 1) + '.t7'))
        scheduler.step()


if __name__ == "__main__":
    main()
def init_fn(self):
    job_env = submitit.JobEnvironment()

    self.train_ds = MixedDataset(self.options, ignore_3d=self.options.ignore_3d, is_train=True)

    self.model = hmr(config.SMPL_MEAN_PARAMS, pretrained=True).to(self.device)
    self.model.cuda(job_env.local_rank)
    self.model = torch.nn.parallel.DistributedDataParallel(
        self.model, device_ids=[job_env.local_rank], output_device=job_env.local_rank)

    if self.options.bExemplarMode:
        lr = 5e-5 * 0.2
    else:
        lr = self.options.lr
    self.optimizer = torch.optim.Adam(
        params=self.model.parameters(),
        # lr=self.options.lr,
        lr=lr,
        weight_decay=0)

    if self.options.bUseSMPLX:  # SMPL-X model
        # No change is required for HMR training. SMPL-X ignores hand and other parts.
        # SMPL uses 23 joints, while SMPL-X uses 21 joints, automatically ignoring
        # the last two joints of SMPL.
        self.smpl = SMPLX(config.SMPL_MODEL_DIR,
                          batch_size=self.options.batch_size,
                          create_transl=False).to(self.device)
    else:  # Original SMPL
        self.smpl = SMPL(config.SMPL_MODEL_DIR,
                         batch_size=self.options.batch_size,
                         create_transl=False).to(self.device)

    # Per-vertex loss on the shape
    self.criterion_shape = nn.L1Loss().to(self.device)
    # Keypoint (2D and 3D) loss
    # No reduction because confidence weighting needs to be applied
    self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device)
    # Loss for SMPL parameter regression
    self.criterion_regr = nn.MSELoss().to(self.device)
    self.models_dict = {'model': self.model}
    self.optimizers_dict = {'optimizer': self.optimizer}
    self.focal_length = constants.FOCAL_LENGTH

    # Initialize SMPLify fitting module
    self.smplify = SMPLify(step_size=1e-2,
                           batch_size=self.options.batch_size,
                           num_iters=self.options.num_smplify_iters,
                           focal_length=self.focal_length)

    if self.options.pretrained_checkpoint is not None:
        print(">>> Load Pretrained mode: {}".format(self.options.pretrained_checkpoint))
        self.load_pretrained(checkpoint_file=self.options.pretrained_checkpoint)
        self.backupModel()  # This should be called here after loading model

    # if torch.cuda.device_count() > 1:
    assert torch.cuda.device_count() > 1
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # self.model = torch.nn.DataParallel(self.model)  # Failed...
    # self.model.cuda(job_env.local_rank)

    # Load dictionary of fits
    self.fits_dict = FitsDict(self.options, self.train_ds)

    # Create renderer
    self.renderer = None
    # Renderer(focal_length=self.focal_length, img_res=self.options.img_res, faces=self.smpl.faces)

    # debug
    from torchvision.transforms import Normalize
    self.de_normalize_img = Normalize(
        mean=[-constants.IMG_NORM_MEAN[0] / constants.IMG_NORM_STD[0],
              -constants.IMG_NORM_MEAN[1] / constants.IMG_NORM_STD[1],
              -constants.IMG_NORM_MEAN[2] / constants.IMG_NORM_STD[2]],
        std=[1 / constants.IMG_NORM_STD[0],
             1 / constants.IMG_NORM_STD[1],
             1 / constants.IMG_NORM_STD[2]])
def _train(self) -> Optional[float]:
    job_env = submitit.JobEnvironment()
    batch_step = 0  # forward batch count
    best_mrr = 0
    train_loss_meter = AverageMeter()
    print("Start training", flush=True)
    # Start from the loaded epoch
    start_epoch = self._state.epoch
    global_step = self._state.global_step
    for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
        print(f"Start epoch {epoch}", flush=True)
        self._state.model.train()
        self._state.epoch = epoch
        for batch in self._train_loader:
            batch_step += 1
            batch = move_to_cuda(batch)
            loss = mhop_loss(self._state.model, batch, self._train_cfg)

            if self._train_cfg.gradient_accumulation_steps > 1:
                loss = loss / self._train_cfg.gradient_accumulation_steps
            if self._train_cfg.fp16:
                with amp.scale_loss(loss, self._state.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss_meter.update(loss.item())

            if (batch_step + 1) % self._train_cfg.gradient_accumulation_steps == 0:
                if self._train_cfg.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self._state.optimizer), self._train_cfg.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self._state.model.parameters(), self._train_cfg.max_grad_norm)
                self._state.optimizer.step()
                self._state.lr_scheduler.step()
                self._state.model.zero_grad()
                global_step += 1
                self._state.global_step = global_step
                self.tb_logger.add_scalar('batch_train_loss', loss.item(), global_step)
                self.tb_logger.add_scalar('smoothed_train_loss', train_loss_meter.avg, global_step)

        # Checkpoint only on the master
        # if job_env.global_rank == 0:
        self.checkpoint(rm_init=False)
        mrrs = self._eval()
        mrr = mrrs["mrr_avg"]
        self.tb_logger.add_scalar('dev_mrr', mrr * 100, epoch)
        self._state.lr_scheduler.step(mrr)
        if best_mrr < mrr:
            print("Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                  (best_mrr * 100, mrr * 100, epoch))
            torch.save(self._state.model.state_dict(),
                       os.path.join(self._train_cfg.output_dir, "checkpoint_best.pt"))
            best_mrr = mrr
        self.log({
            "best_mrr": best_mrr,
            "curr_mrr": mrr,
            "smoothed_loss": train_loss_meter.avg,
            "epoch": epoch
        })
    return best_mrr