Example #1
    def __call__(
        self,
        sweep_overrides: List[str],
        job_dir_key: str,
        job_num: int,
        job_id: str,
        singleton_state: Dict[type, Singleton],
    ) -> JobReturn:
        # lazy import to ensure plugin discovery remains fast
        import submitit

        assert self.config_loader is not None
        assert self.config is not None
        assert self.task_function is not None

        Singleton.set_state(singleton_state)
        setup_globals()
        sweep_config = self.config_loader.load_sweep_config(
            self.config, sweep_overrides)

        with open_dict(sweep_config.hydra.job) as job:
            # Populate new job variables
            job.id = submitit.JobEnvironment().job_id  # type: ignore
            sweep_config.hydra.job.num = job_num

        return run_job(
            config=sweep_config,
            task_function=self.task_function,
            job_dir_key=job_dir_key,
            job_subdir_key="hydra.sweep.subdir",
        )
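The callable above is what Hydra's submitit launcher pickles and runs inside a submitit job, which is why submitit.JobEnvironment() resolves in it. For reference, a minimal stand-alone sketch of the same pattern; the log folder and partition name are placeholder assumptions:

import submitit

def task(x: int) -> int:
    # Inside the job, JobEnvironment() exposes job_id, ranks, hostnames, etc.
    env = submitit.JobEnvironment()
    print(f"job {env.job_id}, rank {env.global_rank}/{env.num_tasks}")
    return x * 2

executor = submitit.AutoExecutor(folder="submitit_logs/%j")  # %j -> job id
executor.update_parameters(timeout_min=60, slurm_partition="dev")  # partition name assumed
job = executor.submit(task, 21)
print(job.result())  # blocks until the job finishes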
Example #2
    def log(self, log_data: dict):
        job_env = submitit.JobEnvironment()
        # z = {**vars(self._train_cfg), **log_data}
        save_dir = Path(self._train_cfg.output_dir)
        os.makedirs(save_dir, exist_ok=True)
        with open(save_dir / 'log.txt', 'a') as f:
            f.write(json.dumps(log_data) + '\n')
Example #3
    def _setup_gpu_args(self):
        import submitit

        job_env = submitit.JobEnvironment()
        print(self.args)

        self.args.machine_rank = job_env.global_rank
        print(f"Process rank: {job_env.global_rank}")
Example #4
    def _setup_gpu_args(self):
        import submitit
        import os

        job_env = submitit.JobEnvironment()
        if os.path.basename(self.args.output_dir) != str(job_env.job_id):
            self.args.output_dir = os.path.join(self.args.output_dir,
                                                str(job_env.job_id))
Example #5
    def __call__(self):
        job_env = submitit.JobEnvironment()
        os.environ["MASTER_ADDR"] = job_env.hostnames[0]
        os.environ["MASTER_PORT"] = str(self.port)
        os.environ["RANK"] = str(job_env.global_rank)
        os.environ["LOCAL_RANK"] = str(job_env.local_rank)
        os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
        setup_distributed(self.cfg_state)
        self.fun()
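setup_distributed above is project code, but with MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE exported as in this __call__, it typically reduces to an env:// rendezvous. A hedged sketch, assuming NCCL/GPU training:

import os

import torch.distributed as dist

def setup_distributed_from_env() -> None:
    # Hypothetical helper: relies on the environment variables set above.
    dist.init_process_group(
        backend="nccl",  # assumption: GPU training; use "gloo" on CPU
        init_method="env://",
        rank=int(os.environ["RANK"]),
        world_size=int(os.environ["WORLD_SIZE"]),
    )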
Example #6
    def setup_environ(self, args):
        self.job = False
        try:
            import submitit
            self.job_env = submitit.JobEnvironment()
            args.logdir = args.logdir.replace('%j', str(self.job_env.job_id))
            self.job = True
        except (ImportError, RuntimeError):
            # not running inside a submitit job
            self.job_env = None
Example #7
    def __call__(self):
        import submitit

        environment = submitit.JobEnvironment()
        node_id = environment.global_rank
        master_ip = environment.hostnames[0]
        master_port = self.config.SLURM.PORT_ID
        self.config.DISTRIBUTED.INIT_METHOD = "tcp"
        self.config.DISTRIBUTED.RUN_ID = f"{master_ip}:{master_port}"
        extract_features_and_run_knn(node_id=node_id, config=self.config)
Example #8
    def _setup_gpu_args(self):
        import submitit
        from pathlib import Path

        job_env = submitit.JobEnvironment()
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
Example #9
def update_logdir(opt):
    try:
        import submitit
        job_env = submitit.JobEnvironment()
        opt.logdir = opt.logdir.replace('%j', str(job_env.job_id))
    except (ImportError, RuntimeError):
        print('No job id found')
        opt.logdir = 'runs/test/'
    os.makedirs(opt.logdir, exist_ok=True)
Example #10
    def checkpoint(self):
        import submitit

        job_env = submitit.JobEnvironment()
        slurm_job_id = job_env.job_id
        if self.args.resume_job == "":
            self.args.resume_job = slurm_job_id
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)
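This checkpoint() hands submitit a DelayedSubmission so the trainer is requeued after preemption. When no custom state handling is needed, submitit.helpers.Checkpointable provides that behaviour by default; a minimal sketch (the step counter is purely illustrative):

import submitit

class ResumableTask(submitit.helpers.Checkpointable):
    # Checkpointable's default checkpoint() requeues this same pickled
    # instance, so any state stored on `self` survives preemption.
    def __init__(self) -> None:
        self.steps_done = 0

    def __call__(self, total_steps: int = 100) -> int:
        while self.steps_done < total_steps:
            self.steps_done += 1  # after a requeue, execution resumes from here
        return self.steps_done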
Example #11
    def _setup_process_group(self) -> None:
        job_env = submitit.JobEnvironment()
        torch.cuda.set_device(job_env.local_rank)
        torch.distributed.init_process_group(
            backend=self._cluster_cfg.dist_backend,
            init_method=self._cluster_cfg.dist_url,
            world_size=job_env.num_tasks,
            rank=job_env.global_rank,
        )
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
Example #12
    def _setup_gpu_args(self):
        import submitit
        import os

        job_env = submitit.JobEnvironment()
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        if os.path.basename(self.args.output_dir) != str(job_env.job_id):
            self.args.output_dir = os.path.join(self.args.output_dir,
                                                str(job_env.job_id))
        print(
            f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}"
        )
Example #13
    def checkpoint(self, rm_init=True) -> submitit.helpers.DelayedSubmission:
        # will be called by submitit in case of preemption
        job_env = submitit.JobEnvironment()
        save_dir = osp.join(self._train_cfg.output_dir, str(job_env.job_id))
        os.makedirs(save_dir, exist_ok=True)
        self._state.save(osp.join(save_dir, "checkpoint.pth"))

        # Trick here: when the job is requeued, it will reuse the same init file,
        # but the file must not exist when we initialize the process group,
        # so we delete it, but only when this method is called by submitit for a requeue.
        if rm_init and osp.exists(self._cluster_cfg.dist_url[7:]):
            os.remove(self._cluster_cfg.dist_url[7:])  # remove file:// at the beginning
        # Creating a fresh Trainer drops any non-picklable parts of the current instance.
        empty_trainer = Trainer(self._train_cfg, self._cluster_cfg)
        return submitit.helpers.DelayedSubmission(empty_trainer)
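The dist_url handled above is a file:// rendezvous URL, and the [7:] slice strips the scheme to recover the filesystem path. How such a URL is created is not shown here; a hedged sketch, with the shared folder path as an assumption:

import uuid
from pathlib import Path

def get_init_file(shared_folder: str = "/checkpoint/dist_init") -> Path:  # path assumed
    # A unique file on a filesystem visible to every task of the job; it
    # must not exist yet when init_process_group reads it.
    folder = Path(shared_folder)
    folder.mkdir(parents=True, exist_ok=True)
    init_file = folder / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        init_file.unlink()
    return init_file

# dist_url = get_init_file().as_uri()  # yields "file:///..." for init_process_group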
Example #14
    def __call__(self):
        import submitit

        environment = submitit.JobEnvironment()
        node_id = environment.global_rank
        master_ip = environment.hostnames[0]
        master_port = self.config.SLURM.PORT_ID
        self.config.DISTRIBUTED.INIT_METHOD = "tcp"
        self.config.DISTRIBUTED.RUN_ID = f"{master_ip}:{master_port}"
        launch_distributed(
            cfg=self.config,
            node_id=node_id,
            engine_name=self.engine_name,
            hook_generator=default_hook_generator,
        )
Example #15
    def _eval(self) -> float:
        print("Start evaluation of the model", flush=True)
        job_env = submitit.JobEnvironment()
        args = self._train_cfg
        eval_dataloader = self._test_loader
        num_correct = 0
        num_total = 0.0
        rrs = []  # reciprocal rank
        self._state.model.eval()
        for batch in self._test_loader:
            batch_to_feed = move_to_cuda(batch)
            with torch.no_grad():
                outputs = self._state.model(batch_to_feed)
                q = outputs['q']
                c = outputs['c']
                neg_c = outputs['neg_c']

                product_in_batch = torch.mm(q, c.t())
                product_neg = (q * neg_c).sum(-1).unsqueeze(1)
                product = torch.cat([product_in_batch, product_neg], dim=-1)

                target = torch.arange(product.size(0)).to(product.device)
                ranked = product.argsort(dim=1, descending=True)

                # MRR
                idx2rank = ranked.argsort(dim=1)
                for idx, t in enumerate(target.tolist()):
                    rrs.append(1 / (idx2rank[idx][t].item() + 1))

                prediction = product.argmax(-1)
                pred_res = prediction == target

                num_total += pred_res.size(0)
                num_correct += pred_res.sum(0)

        acc = num_correct / num_total
        mrr = np.mean(rrs)
        print(f"evaluated {num_total} examples...", flush=True)
        print(f"avg. Acc: {acc}", flush=True)
        print(f'MRR: {mrr}', flush=True)
        self._state.model.train()
        return mrr
Example #16
def run(cfg):
    if cfg.num_gpus > 1:
        job_env = submitit.JobEnvironment()
        rank = job_env.global_rank
        world_size = job_env.num_tasks
        if rank != 0:
            logging.root.handlers = []
        try:
            torch.cuda.set_device(rank)
            torch.distributed.init_process_group(
                backend='nccl',
                init_method="tcp://{}:{}".format('localhost', 10001),
                world_size=world_size,
                rank=rank)
            train(cfg, is_leader=(rank == 0))
        except KeyboardInterrupt:
            pass
        finally:
            torch.distributed.destroy_process_group()
    else:
        train(cfg, is_leader=True)
Example #17
    def _eval(self) -> dict:
        print("Start evaluation of the model", flush=True)
        job_env = submitit.JobEnvironment()
        args = self._train_cfg
        eval_dataloader = self._test_loader
        self._state.model.eval()
        rrs_1, rrs_2 = [], []  # reciprocal rank
        for batch in tqdm(eval_dataloader):
            batch_to_feed = move_to_cuda(batch)
            with torch.no_grad():
                outputs = self._state.model(batch_to_feed)
                eval_results = mhop_eval(outputs, args)
                _rrs_1, _rrs_2 = eval_results["rrs_1"], eval_results["rrs_2"]
                rrs_1 += _rrs_1
                rrs_2 += _rrs_2
        mrr_1 = np.mean(rrs_1)
        mrr_2 = np.mean(rrs_2)
        print(f"evaluated {len(rrs_1)} examples...")
        print(f'MRR-1: {mrr_1}')
        print(f'MRR-2: {mrr_2}')
        self._state.model.train()
        return {"mrr_1": mrr_1, "mrr_2": mrr_2, "mrr_avg": (mrr_1 + mrr_2) / 2}
Example #18
    def distributed_setup(self):
        if self.cluster_params.get("use_ethernet", False):
            printc("Forcing ethernet communication", color="CYAN")
            os.environ["NCCL_SOCKET_IFNAME"] = get_tcp_interface_name(
                network_interface_type="ethernet")
            os.environ["NCCL_IB_DISABLE"] = "1"

        job_env = submitit.JobEnvironment()
        master_node = job_env.hostnames[0]
        attrs = ["global_rank", "local_rank", "num_nodes", "num_tasks", "node"]
        self.distributed = {k: getattr(job_env, k) for k in attrs}
        self.distributed["master"] = master_node
        # Init torch.distributed WORLD group
        printc(f"Running with job_id: {job_env.job_id}", color="CYAN")
        port = 42000 + (deterministic_hash(job_env.job_id) % 10000)
        addr = f"tcp://{master_node}:{port}"
        printc(f"Initializing dist group at {addr}", color="CYAN")
        dist.init_process_group(
            init_method=addr,
            rank=job_env.global_rank,
            world_size=job_env.num_tasks,
            backend="nccl",
        )
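deterministic_hash above is a project helper rather than part of submitit; Python's built-in hash() is salted per process, so each task would compute a different port. A minimal stdlib sketch under that assumption:

import hashlib

def deterministic_hash(value) -> int:
    # Stable across processes and machines, unlike the built-in hash().
    digest = hashlib.sha256(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16)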
Example #19
    def _init_state(self) -> None:
        """
        Initialize the state and load it from an existing checkpoint if any
        """
        job_env = submitit.JobEnvironment()

        if job_env.global_rank == 0:
            # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
            os.makedirs(self._train_cfg.output_dir, exist_ok=True)
            config_path = Path(self._train_cfg.output_dir)  / 'config.json'
            with open(config_path, "w") as g:
                g.write(json.dumps(self._train_cfg._asdict()))

        print(f"Setting random seed {self._train_cfg.seed}", flush=True)
        random.seed(self._train_cfg.seed)
        np.random.seed(self._train_cfg.seed)
        torch.manual_seed(self._train_cfg.seed)
        torch.cuda.manual_seed_all(self._train_cfg.seed)

        print("Create data loaders", flush=True)
        tokenizer = AutoTokenizer.from_pretrained(self._train_cfg.model_name)
        collate_fc = partial(mhop_collate, pad_id=tokenizer.pad_token_id)
        train_set = MhopDataset(tokenizer, self._train_cfg.train_file,  self._train_cfg.max_q_len, self._train_cfg.max_q_sp_len, self._train_cfg.max_c_len, train=True)

        self._train_loader = torch.utils.data.DataLoader(train_set, batch_size=self._train_cfg.train_batch_size, num_workers=self._train_cfg.num_workers, collate_fn=collate_fc, shuffle=True)
        test_set = MhopDataset(tokenizer, self._train_cfg.predict_file, self._train_cfg.max_q_len, self._train_cfg.max_q_sp_len, self._train_cfg.max_c_len)

        self._test_loader = torch.utils.data.DataLoader(
            test_set,
            batch_size=self._train_cfg.predict_batch_size,
            num_workers=self._train_cfg.num_workers, collate_fn=collate_fc, pin_memory=True
        )

        print("Create model", flush=True)
        print(f"Local rank {job_env.local_rank}", flush=True)
        bert_config = AutoConfig.from_pretrained(self._train_cfg.model_name)
        if "roberta" in self._train_cfg.model_name:
            model = RobertaRetriever(bert_config, self._train_cfg)
        else:
            model = MhopRetriever(bert_config, self._train_cfg)
        model.cuda(job_env.local_rank)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(
                nd in n for nd in no_decay)], 'weight_decay': self._train_cfg.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = Adam(optimizer_parameters, lr=self._train_cfg.learning_rate, eps=self._train_cfg.adam_epsilon)

        if self._train_cfg.fp16:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=self._train_cfg.fp16_opt_level)

        t_total = len(self._train_loader) // self._train_cfg.gradient_accumulation_steps * self._train_cfg.num_train_epochs
        warmup_steps = t_total * self._train_cfg.warmup_ratio
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        )
        model = torch.nn.DataParallel(model)
        self._state = TrainerState(
            epoch=0, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, global_step=0
        )

        self.tb_logger = SummaryWriter(self._train_cfg.output_dir.replace("logs", "tflogs"))

        checkpoint_fn = osp.join(self._train_cfg.output_dir, str(job_env.job_id), "checkpoint.pth")
        # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
        if os.path.isfile(checkpoint_fn):
            print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
            self._state = TrainerState.load(
                checkpoint_fn, default=self._state, gpu=job_env.local_rank)
Example #20
    def _train(self) -> Optional[float]:
        job_env = submitit.JobEnvironment()
        batch_step = 0 # forward batch count
        best_metric = 0
        train_loss_meter = AverageMeter()
        print(f"Start training", flush=True)
        # Start from the loaded epoch
        start_epoch = self._state.epoch
        global_step = self._state.global_step
        for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
            print(f"Start epoch {epoch}", flush=True)
            self._state.model.train()
            self._state.epoch = epoch

            for batch in self._train_loader:
                batch_step += 1
                batch_inputs = move_to_cuda(batch["net_inputs"])
                loss = self._state.model(batch_inputs)
                if torch.cuda.device_count() > 1:
                    loss = loss.mean()
                if self._train_cfg.gradient_accumulation_steps > 1:
                    loss = loss / self._train_cfg.gradient_accumulation_steps
                if self._train_cfg.fp16:
                    with amp.scale_loss(loss, self._state.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss_meter.update(loss.item())
                if (batch_step + 1) % self._train_cfg.gradient_accumulation_steps == 0:
                    if self._train_cfg.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self._state.optimizer), self._train_cfg.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self._state.model.parameters(), self._train_cfg.max_grad_norm)
                    self._state.optimizer.step()
                    self._state.lr_scheduler.step()
                    self._state.model.zero_grad()
                    global_step += 1
                    self._state.global_step = global_step

                    self.tb_logger.add_scalar('batch_train_loss',
                                        loss.item(), global_step)
                    self.tb_logger.add_scalar('smoothed_train_loss',
                                        train_loss_meter.avg, global_step)
                    if job_env.global_rank == 0:
                        if self._train_cfg.eval_period != -1 and global_step % self._train_cfg.eval_period == 0:
                            metrics = self._eval()
                            for k, v in metrics.items():
                                self.tb_logger.add_scalar(k, v*100, global_step)
                            score = metrics[self._train_cfg.final_metric]
                            if best_metric < score:
                                print("Saving model with best %s %.2f -> em %.2f" % (self._train_cfg.final_metric, best_metric*100, score*100), flush=True)
                                torch.save(self._state.model.state_dict(), os.path.join(self._train_cfg.output_dir, f"checkpoint_best.pt"))
                                best_metric = score
            # Checkpoint only on the master
            if job_env.global_rank == 0:
                self.checkpoint(rm_init=False)
                metrics = self._eval()
                for k, v in metrics.items():
                    self.tb_logger.add_scalar(k, v*100, global_step)
                score = metrics[self._train_cfg.final_metric]
                if best_metric < score:
                    print("Saving model with best %s %.2f -> em %.2f" % (self._train_cfg.final_metric, best_metric*100, score*100), flush=True)
                    torch.save(self._state.model.state_dict(), os.path.join(self._train_cfg.output_dir, f"checkpoint_best.pt"))
                    best_metric = score
                self.log({
                    "best_score": best_metric,
                    "curr_score": score,
                    "smoothed_loss": train_loss_meter.avg,
                    "epoch": epoch
                })
        return best_metric
Example #21
def my_app(cfg: DictConfig):
    env = submitit.JobEnvironment()
    log.info(f"Process ID {os.getpid()} executing task {cfg.task}, with {env}")
    time.sleep(1)
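my_app above only sees a submitit.JobEnvironment because it is launched through Hydra's submitit launcher plugin (hydra-submitit-launcher). A hedged sketch of the usual wiring; the config path/name and the task field are assumptions:

import logging
import os
import time

import hydra
import submitit
from omegaconf import DictConfig

log = logging.getLogger(__name__)

@hydra.main(config_path="conf", config_name="config")  # layout assumed
def my_app(cfg: DictConfig) -> None:
    env = submitit.JobEnvironment()
    log.info(f"Process ID {os.getpid()} executing task {cfg.task}, with {env}")
    time.sleep(1)

if __name__ == "__main__":
    # e.g.  python my_app.py --multirun hydra/launcher=submitit_slurm task=1,2,3
    my_app()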
Example #22
    def _init_state(self) -> None:
        """
        Initialize the state and load it from an existing checkpoint if any
        """
        job_env = submitit.JobEnvironment()

        if job_env.global_rank == 0:
            # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
            os.makedirs(self._train_cfg.output_dir, exist_ok=True)
            config_path = Path(self._train_cfg.output_dir) / 'config.json'
            with open(config_path, "w") as g:
                g.write(json.dumps(self._train_cfg._asdict()))

        print(f"Setting random seed {self._train_cfg.seed}", flush=True)
        random.seed(self._train_cfg.seed)
        np.random.seed(self._train_cfg.seed)
        torch.manual_seed(self._train_cfg.seed)

        print("Create data loaders", flush=True)
        tokenizer = BertTokenizer.from_pretrained(
            self._train_cfg.bert_model_name)
        collate_fc = sp_collate
        train_set = SPDataset(tokenizer,
                              self._train_cfg.train_file,
                              self._train_cfg.max_q_len,
                              self._train_cfg.max_c_len,
                              train=True)
        # train_sampler = torch.utils.data.distributed.DistributedSampler(
        #     train_set, num_replicas=job_env.num_tasks, rank=job_env.global_rank
        # )
        # self._train_loader = torch.utils.data.DataLoader(
        #     train_set,
        #     batch_size=self._train_cfg.train_batch_size,
        #     num_workers=4,
        #     sampler=train_sampler, collate_fn=collate_fc
        # )
        self._train_loader = torch.utils.data.DataLoader(
            train_set,
            batch_size=self._train_cfg.train_batch_size,
            num_workers=4,
            collate_fn=collate_fc)
        test_set = SPDataset(tokenizer, self._train_cfg.predict_file,
                             self._train_cfg.max_q_len,
                             self._train_cfg.max_c_len)
        self._test_loader = torch.utils.data.DataLoader(
            test_set,
            batch_size=self._train_cfg.predict_batch_size,
            num_workers=4,
            collate_fn=collate_fc)
        print(
            f"Per Node batch_size: {self._train_cfg.train_batch_size // job_env.num_tasks}",
            flush=True)

        print("Create model", flush=True)
        print(f"Local rank {job_env.local_rank}", flush=True)
        bert_config = BertConfig.from_pretrained(
            self._train_cfg.bert_model_name)
        model = BertForRetrieverSP(bert_config, self._train_cfg)
        model.cuda(job_env.local_rank)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self._train_cfg.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_parameters,
                          lr=self._train_cfg.learning_rate)
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                            mode='max',
                                                            factor=0.5)

        if self._train_cfg.fp16:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=self._train_cfg.fp16_opt_level)
        model = torch.nn.DataParallel(model)  #
        self._state = TrainerState(epoch=0,
                                   model=model,
                                   optimizer=optimizer,
                                   lr_scheduler=lr_scheduler,
                                   global_step=0)

        self.tb_logger = SummaryWriter(
            os.path.join(self._train_cfg.output_dir, "tblog"))

        checkpoint_fn = osp.join(self._train_cfg.output_dir,
                                 str(job_env.job_id), "checkpoint.pth")
        # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
        if os.path.isfile(checkpoint_fn):
            print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
            self._state = TrainerState.load(checkpoint_fn,
                                            default=self._state,
                                            gpu=job_env.local_rank)
Example #23
    def _eval(self) -> dict:
        print("Start evaluation of the model", flush=True)
        job_env = submitit.JobEnvironment()
        args = self._train_cfg
        eval_dataloader = self._test_loader
        model = self._state.model
        model.eval()
        id2result = collections.defaultdict(list)
        id2answer = collections.defaultdict(list)
        id2gold = {}
        id2goldsp = {}
        for batch in tqdm(eval_dataloader):
            batch_to_feed = move_to_cuda(batch["net_inputs"])
            batch_qids = batch["qids"]
            batch_labels = batch["net_inputs"]["label"].view(-1).tolist()
            with torch.no_grad():
                outputs = model(batch_to_feed)
                scores = outputs["rank_score"]
                scores = scores.view(-1).tolist()
                sp_scores = outputs["sp_score"]
                sp_scores = sp_scores.float().masked_fill(batch_to_feed["sent_offsets"].eq(0), float("-inf")).type_as(sp_scores)
                batch_sp_scores = sp_scores.sigmoid()
                # ans_type_predicted = torch.argmax(outputs["ans_type_logits"], dim=1).view(-1).tolist()
                outs = [outputs["start_logits"], outputs["end_logits"]]
            for qid, label, score in zip(batch_qids, batch_labels, scores):
                id2result[qid].append((label, score))

            # answer prediction
            span_scores = outs[0][:, :, None] + outs[1][:, None]
            max_seq_len = span_scores.size(1)
            span_mask = np.tril(np.triu(np.ones((max_seq_len, max_seq_len)), 0), args.max_ans_len)
            span_mask = span_scores.data.new(max_seq_len, max_seq_len).copy_(torch.from_numpy(span_mask))
            span_scores_masked = span_scores.float().masked_fill((1 - span_mask[None].expand_as(span_scores)).bool(), -1e10).type_as(span_scores)
            start_position = span_scores_masked.max(dim=2)[0].max(dim=1)[1]
            end_position = span_scores_masked.max(dim=2)[1].gather(
                1, start_position.unsqueeze(1)).squeeze(1)
            answer_scores = span_scores_masked.max(dim=2)[0].max(dim=1)[0].tolist()
            para_offset = batch['para_offsets']
            start_position_ = list(
                np.array(start_position.tolist()) - np.array(para_offset))
            end_position_ = list(
                np.array(end_position.tolist()) - np.array(para_offset))  
 
            for idx, qid in enumerate(batch_qids):
                id2gold[qid] = batch["gold_answer"][idx]
                id2goldsp[qid] = batch["sp_gold"][idx]
                rank_score = scores[idx]
                sp_score = batch_sp_scores[idx].tolist()
                start = start_position_[idx]
                end = end_position_[idx]
                span_score = answer_scores[idx]

                tok_to_orig_index = batch['tok_to_orig_index'][idx]
                doc_tokens = batch['doc_tokens'][idx]
                wp_tokens = batch['wp_tokens'][idx]
                orig_doc_start = tok_to_orig_index[start]
                orig_doc_end = tok_to_orig_index[end]
                orig_tokens = doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_tokens = wp_tokens[start:end+1]
                tok_text = " ".join(tok_tokens)
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)
                pred_str = get_final_text(tok_text, orig_text, do_lower_case=True, verbose_logging=False)

                pred_sp = []
                passages = batch["passages"][idx]
                for passage, sent_offset in zip(passages, [0, len(passages[0]["sents"])]):
                    for idx, _ in enumerate(passage["sents"]):
                        try:
                            if sp_score[idx + sent_offset] > 0.5:
                                pred_sp.append([passage["title"], idx])
                        except IndexError:
                            continue
                id2answer[qid].append((pred_str.strip(), rank_score, span_score, pred_sp))

        acc = []
        for qid, res in id2result.items():
            res.sort(key=lambda x: x[1], reverse=True)
            acc.append(res[0][0] == 1)
        print(f"evaluated {len(id2result)} questions...", flush=True)
        print(f'chain ranking em: {np.mean(acc)}', flush=True)

        best_em, best_f1, best_joint_em, best_joint_f1 = 0, 0, 0, 0
        lambdas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        for lambda_ in lambdas:
            ems, f1s = [], []
            sp_ems, sp_f1s = [], []
            joint_ems, joint_f1s = [], []
            for qid, res in id2result.items():
                ans_res = id2answer[qid]
                ans_res.sort(key=lambda x: lambda_ * x[1] + (1 - lambda_) * x[2], reverse=True)
                top_pred = ans_res[0][0]
                ems.append(exact_match_score(top_pred, id2gold[qid][0]))
                f1, prec, recall = f1_score(top_pred, id2gold[qid][0])
                f1s.append(f1)

                top_pred_sp = ans_res[0][3]
                metrics = {'sp_em': 0, 'sp_f1': 0, 'sp_prec': 0, 'sp_recall': 0}
                update_sp(metrics, top_pred_sp, id2goldsp[qid])
                sp_ems.append(metrics['sp_em'])
                sp_f1s.append(metrics['sp_f1'])

                # joint metrics
                joint_prec = prec * metrics["sp_prec"]
                joint_recall = recall * metrics["sp_recall"]
                if joint_prec + joint_recall > 0:
                    joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
                else:
                    joint_f1 = 0
                joint_em = ems[-1] * sp_ems[-1]
                joint_ems.append(joint_em)
                joint_f1s.append(joint_f1)

            if best_joint_f1 < np.mean(joint_f1s):
                best_joint_f1 = np.mean(joint_f1s)
                best_joint_em = np.mean(joint_ems)
                best_f1 = np.mean(f1s)
                best_em = np.mean(ems)

            print(f".......Using combination factor {lambda_}......", flush=True)
            print(f'answer em: {np.mean(ems)}, count: {len(ems)}', flush=True)
            print(f'answer f1: {np.mean(f1s)}, count: {len(f1s)}', flush=True)
            print(f'sp em: {np.mean(sp_ems)}, count: {len(sp_ems)}', flush=True)
            print(f'sp f1: {np.mean(sp_f1s)}, count: {len(sp_f1s)}', flush=True)
            print(f'joint em: {np.mean(joint_ems)}, count: {len(joint_ems)}', flush=True)
            print(f'joint f1: {np.mean(joint_f1s)}, count: {len(joint_f1s)}', flush=True)
        print(f"Best joint EM/F1 from combination {best_em}/{best_f1}", flush=True)

        model.train()
        return {"em": best_em, "f1": best_f1, "joint_em": best_joint_em, "joint_f1": best_joint_f1}
Example #24
    def _train(self) -> Optional[float]:
        job_env = submitit.JobEnvironment()

        loss_fct = CrossEntropyLoss()
        batch_step = 0  # forward batch count
        best_mrr = 0
        train_loss_meter = AverageMeter()
        print(f"Start training", flush=True)
        # Start from the loaded epoch
        start_epoch = self._state.epoch
        global_step = self._state.global_step
        for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
            print(f"Start epoch {epoch}", flush=True)
            self._state.model.train()
            self._state.epoch = epoch

            for batch in self._train_loader:
                batch_step += 1
                batch = move_to_cuda(batch)
                outputs = self._state.model(batch)
                q = outputs['q']
                c = outputs['c']
                neg_c = outputs['neg_c']
                product_in_batch = torch.mm(q, c.t())
                product_neg = (q * neg_c).sum(-1).unsqueeze(1)
                product = torch.cat([product_in_batch, product_neg], dim=-1)
                target = torch.arange(product.size(0)).to(product.device)
                loss = loss_fct(product, target)

                if self._train_cfg.gradient_accumulation_steps > 1:
                    loss = loss / self._train_cfg.gradient_accumulation_steps
                if self._train_cfg.fp16:
                    with amp.scale_loss(loss,
                                        self._state.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                train_loss_meter.update(loss.item())
                self.tb_logger.add_scalar('batch_train_loss', loss.item(),
                                          global_step)
                self.tb_logger.add_scalar('smoothed_train_loss',
                                          train_loss_meter.avg, global_step)

                if (batch_step +
                        1) % self._train_cfg.gradient_accumulation_steps == 0:
                    if self._train_cfg.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self._state.optimizer),
                            self._train_cfg.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self._state.model.parameters(),
                            self._train_cfg.max_grad_norm)
                    self._state.optimizer.step()  # we have accumulated enough gradients
                    self._state.model.zero_grad()
                    global_step += 1
                    self._state.global_step = global_step

            # Checkpoint only on the master
            # if job_env.global_rank == 0:
            self.checkpoint(rm_init=False)
            mrr = self._eval()
            self.tb_logger.add_scalar('dev_mrr', mrr * 100, epoch)
            self._state.lr_scheduler.step(mrr)
            if best_mrr < mrr:
                print(
                    "Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                    (best_mrr * 100, mrr * 100, epoch))
                torch.save(
                    self._state.model.state_dict(),
                    os.path.join(self._train_cfg.output_dir,
                                 f"checkpoint_best.pt"))
                best_mrr = mrr
            self.log({
                "best_mrr": best_mrr,
                "curr_mrr": mrr,
                "smoothed_loss": train_loss_meter.avg,
                "epoch": epoch
            })
        return best_mrr
Example #25
def main(
        init_file,
        path_model="test.t7",
        result_file='blabla.txt',
        dataset="CIFAR10",
        num_classes=10,
        batch_size=256,
        attack="PGDL2",
        eot_samples=1,  # 80
        noise=None,
        batch_prediction=1,
        sigma=0.25,
        save_image=False):

    torch.manual_seed(1234)

    job_env = submitit.JobEnvironment()
    print(job_env)
    torch.cuda.set_device(job_env.local_rank)

    torch.distributed.init_process_group(
        backend="nccl",
        init_method=init_file,
        world_size=job_env.num_tasks,
        rank=job_env.global_rank,
    )
    if noise is None:
        batch_prediction = None
        sigma = None
    # Load inputs
    test_loader = load_data(dataset=dataset,
                            datadir="datasets",
                            batch_size_per_gpu=int(batch_size /
                                                   job_env.num_tasks),
                            job_env=job_env,
                            train_mode=False)
    num_images = len(test_loader.dataset)

    # Classifier  definition
    # torch.nn.Module.dump_patches = True

    # model_load = torch.load(path_model)
    # Classifier = model_load["net"]
    ckpt = torch.load(path_model)
    epoch = ckpt["epoch"]

    model, _ = getNetwork(net_type="wide-resnet",
                          depth=28,
                          widen_factor=10,
                          dropout=0.3,
                          num_classes=num_classes)

    model.load_state_dict(ckpt["model_state_dict"])
    Classifier = RandModel(model, noise=noise, sigma=sigma)
    Classifier.cuda(job_env.local_rank)

    cudnn.benchmark = True
    Classifier = torch.nn.parallel.DistributedDataParallel(
        Classifier,
        device_ids=[job_env.local_rank],
        output_device=job_env.local_rank)

    print("Classifier intialized")
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
    # print(Classifier)
    Classifier.eval()

    adversaries = dict()

    adversaries["CW"] = attacks.CarliniWagnerL2Attack(Classifier,
                                                      num_classes,
                                                      learning_rate=0.01,
                                                      binary_search_steps=9,
                                                      max_iterations=60,
                                                      abort_early=True,
                                                      initial_const=0.001,
                                                      clip_min=0.0,
                                                      clip_max=1.)

    adversaries["EAD"] = attacks.ElasticNetL1Attack(Classifier,
                                                    num_classes,
                                                    confidence=0,
                                                    targeted=False,
                                                    learning_rate=0.01,
                                                    binary_search_steps=9,
                                                    max_iterations=60,
                                                    abort_early=True,
                                                    initial_const=1e-3,
                                                    clip_min=0.,
                                                    clip_max=1.,
                                                    beta=1e-3,
                                                    decision_rule='EN')

    adversaries["PGDL1"] = attacks.SparseL1PGDAttack(Classifier,
                                                     eps=10.,
                                                     nb_iter=40,
                                                     eps_iter=2 * 10. / 40,
                                                     rand_init=False,
                                                     clip_min=0.0,
                                                     clip_max=1.0,
                                                     sparsity=0.05,
                                                     eot_samples=eot_samples)

    adversaries["PGDLinf"] = attacks.LinfPGDAttack(Classifier,
                                                   eps=0.031,
                                                   nb_iter=40,
                                                   eps_iter=2 * 0.031 / 40,
                                                   rand_init=True,
                                                   clip_min=0.0,
                                                   clip_max=1.0,
                                                   eot_samples=eot_samples)

    adversaries["PGDL2"] = attacks.L2PGDAttack(Classifier,
                                               eps=2.,
                                               nb_iter=40,
                                               eps_iter=2 * 2. / 40,
                                               rand_init=True,
                                               clip_min=0.0,
                                               clip_max=1.0,
                                               eot_samples=eot_samples)

    adversaries["FGSM"] = attacks.GradientSignAttack(Classifier,
                                                     loss_fn=None,
                                                     eps=0.05,
                                                     clip_min=0.,
                                                     clip_max=1.,
                                                     targeted=False,
                                                     eot_samples=eot_samples)

    current_num_input = 0
    running_acc = 0

    if attack is not None:
        norms_l1 = []
        norms_l2 = []
        norms_linf = []

    for i, data in enumerate(test_loader, 0):
        if i > 0 and save_image:
            break
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.cuda(job_env.local_rank), labels.cuda(
            job_env.local_rank)
        if (i == 0) and save_image and (job_env.global_rank == 0):
            torchvision.utils.save_image(inputs,
                                         "images_nat.jpg",
                                         nrow=8,
                                         padding=2,
                                         normalize=False,
                                         range=None,
                                         scale_each=False,
                                         pad_value=0)
        if attack is not None:
            inputs_adv = adversaries[attack].perturb(inputs, labels)

            norms_l1_batch = get_lp_norm(inputs_adv - inputs, p=1)
            norms_l2_batch = get_lp_norm(inputs_adv - inputs, p=2)
            norms_linf_batch = get_lp_norm(inputs_adv - inputs, p=np.inf)

            norms_l1.append(norms_l1_batch)
            norms_l2.append(norms_l2_batch)
            norms_linf.append(norms_linf_batch)

            inputs = inputs_adv
            if (i == 0) and save_image and (job_env.global_rank == 0):
                torchvision.utils.save_image(inputs,
                                             "images_adv.jpg",
                                             nrow=8,
                                             padding=2,
                                             normalize=False,
                                             range=None,
                                             scale_each=False,
                                             pad_value=0)

        with torch.no_grad():
            if noise is None:
                outputs = Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)

            else:
                outputs = torch.FloatTensor(labels.shape[0],
                                            num_classes).cuda()
                outputs.zero_()
                for _ in range(batch_prediction):
                    outputs += Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)

        # print statistics
        running_acc += predicted.eq(labels.data).cpu().sum().numpy()

        curr_batch_size = inputs.size(0)
        current_num_input += curr_batch_size
        print("[", (i + 1) * batch_size, "/", num_images, "] running_acc=",
              running_acc / current_num_input)

    running_acc = torch.Tensor([running_acc]).cuda(job_env.local_rank)
    torch.distributed.all_reduce(running_acc,
                                 op=torch.distributed.ReduceOp.SUM)

    accuracy = (running_acc / num_images).cpu().sum().numpy()
    print(accuracy)
    if attack is not None:
        norms_l1 = torch.cat(norms_l1).view(-1)
        norms_l2 = torch.cat(norms_l2).view(-1)
        norms_linf = torch.cat(norms_linf).view(-1)

        norms_l1_gathered = all_gather(norms_l1)
        norms_l2_gathered = all_gather(norms_l2)
        norms_linf_gathered = all_gather(norms_linf)

        norms_l1_gathered = torch.cat(norms_l1_gathered).view(
            -1).detach().cpu().numpy()
        norms_l2_gathered = torch.cat(norms_l2_gathered).view(
            -1).detach().cpu().numpy()
        norms_linf_gathered = torch.cat(norms_linf_gathered).view(
            -1).detach().cpu().numpy()
    if job_env.global_rank == 0:
        if attack is not None:
            np.save(result_file + "_" + attack + "_l1norm", norms_l1_gathered)
            np.save(result_file + "_" + attack + "_l2norm", norms_l2_gathered)
            np.save(result_file + "_" + attack + "_linfnorm",
                    norms_linf_gathered)
        with open(result_file + ".txt", 'a') as f:
            f.write('{} {} {} {} {} {} {}\n'.format(epoch, dataset, noise,
                                                    batch_prediction, attack,
                                                    eot_samples, accuracy))

    torch.distributed.barrier()
    torch.distributed.destroy_process_group()
    print(job_env.local_rank, job_env.global_rank)
    return job_env.local_rank, job_env.global_rank
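all_gather used above is a project helper; torch.distributed.all_gather itself fills a pre-allocated list in place rather than returning one. A minimal sketch, assuming every rank contributes a tensor of the same shape:

import torch
import torch.distributed as dist

def all_gather(tensor: torch.Tensor) -> list:
    # Gather one same-shaped tensor from every rank into a Python list.
    world_size = dist.get_world_size()
    gathered = [torch.zeros_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor)
    return gathered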
Example #26
    def _init_state(self) -> None:
        """
        Initialize the state and load it from an existing checkpoint if any
        """
        job_env = submitit.JobEnvironment()

        if job_env.global_rank == 0:
            # config_path = Path(args.save_folder) / str(job_env.job_id) / 'config.json'
            os.makedirs(self._train_cfg.output_dir, exist_ok=True)
            config_path = Path(self._train_cfg.output_dir)  / 'config.json'
            with open(config_path, "w") as g:
                g.write(json.dumps(self._train_cfg._asdict()))

        print(f"Setting random seed {self._train_cfg.seed}", flush=True)
        random.seed(self._train_cfg.seed)
        np.random.seed(self._train_cfg.seed)
        torch.manual_seed(self._train_cfg.seed)

        print("Create data loaders", flush=True)
        tokenizer = AutoTokenizer.from_pretrained(self._train_cfg.model_name)
        collate_fc = partial(rank_collate, pad_id=tokenizer.pad_token_id)
        train_set = RankingDataset(tokenizer, self._train_cfg.train_file, self._train_cfg.max_seq_len, self._train_cfg.max_q_len, train=True)

        train_sampler = MhopSampler(train_set, num_neg=self._train_cfg.neg_num)

        batch_size_per_gpu = (1 + self._train_cfg.neg_num) * self._train_cfg.num_q_per_gpu
        n_gpu = torch.cuda.device_count()
        print(f"Number of GPUs: {n_gpu}", flush=True)
        print(f"Batch size per node: {batch_size_per_gpu * n_gpu}", flush=True)

        self._train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size_per_gpu * n_gpu, num_workers=self._train_cfg.num_workers, collate_fn=collate_fc, sampler=train_sampler)
        test_set = RankingDataset(tokenizer, self._train_cfg.predict_file, self._train_cfg.max_seq_len, self._train_cfg.max_q_len)
        self._test_loader = torch.utils.data.DataLoader(
            test_set,
            batch_size=self._train_cfg.predict_batch_size,
            num_workers=self._train_cfg.num_workers, collate_fn=collate_fc
        )

        print("Create model", flush=True)
        print(f"Local rank {job_env.local_rank}", flush=True)
        bert_config = AutoConfig.from_pretrained(self._train_cfg.model_name)
        model = QAModel(bert_config, self._train_cfg)
        model.cuda(job_env.local_rank)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(
                nd in n for nd in no_decay)], 'weight_decay': self._train_cfg.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        if self._train_cfg.use_adam:
            optimizer = optim.Adam(optimizer_parameters, lr=self._train_cfg.learning_rate)
        else:
            optimizer = AdamW(optimizer_parameters, lr=self._train_cfg.learning_rate)
        # lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

        if self._train_cfg.fp16:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=self._train_cfg.fp16_opt_level)

        t_total = len(self._train_loader) // self._train_cfg.gradient_accumulation_steps * self._train_cfg.num_train_epochs
        warmup_steps = t_total * self._train_cfg.warmup_ratio
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        )

        model = torch.nn.DataParallel(model)
        self._state = TrainerState(
            epoch=0, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, global_step=0
        )
        self.tb_logger = SummaryWriter(self._train_cfg.output_dir.replace("logs", "tflogs"))

        checkpoint_fn = osp.join(self._train_cfg.output_dir, str(job_env.job_id), "checkpoint.pth")
        # checkpoint_fn = osp.join(self._train_cfg.output_dir, "checkpoint.pth")
        if os.path.isfile(checkpoint_fn):
            print(f"Load existing checkpoint from {checkpoint_fn}", flush=True)
            self._state = TrainerState.load(
                checkpoint_fn, default=self._state, gpu=job_env.local_rank)
Example #27
    def __init__(self, opt, dset=None):
        logdir = opt.logdir
        ###
        try:
            import submitit
            job_env = submitit.JobEnvironment()
            logdir = logdir.replace('%j', str(job_env.job_id))
            opt.logdir = logdir
        except (ImportError, RuntimeError):
            print('No job id found')
        ###
        if opt.ngpus > 1:
            opt.bSz = opt.bSz * opt.ngpus
            opt.n_workers = int(min(opt.n_workers * opt.ngpus, 20))
        self.opt = opt
        print(f'Training with opts: {opt}')

        self.writer = SummaryWriter(logdir)
        print(f'Log dir: {self.writer.log_dir}')
        self.writer.add_text('opts', str(opt), 0)

        # Fix seed
        if opt.seed: torch.manual_seed(opt.seed)

        # depending on the chosen architecture adapt training image size
        if '224' in opt.feat_arch:
            opt.iSz = 224
            print(f'Using iSz: {opt.iSz}')
        else:
            print(f'Continuing with iSz: {opt.iSz}')

        # construct train dataset or use provided one
        if dset is None:
            self.traindset = get_dataset(opt.dataset,
                                         classDset=True,
                                         iSz=opt.iSz)
        else:
            self.traindset = dset

        print(self.traindset)
        print(self.traindset.classes[0].samples[0])
        print('Train dataset class length histogram')
        print(np.histogram([len(c) for c in self.traindset.classes]))
        self.ttype = 'IN' if opt.benchmark == 'IN' else 'miniIN'
        self.traindset.transform = get_transform(self.ttype,
                                                 phase='train',
                                                 do_normalize=True,
                                                 iSz=opt.iSz)
        print('Train transform: ', self.traindset.transform)
        # construct dataloader
        self.init_dataloader(self.traindset)

        # construct validation/test dataset
        self.get_val_test_sets()
        print('val dataset: ', self.valdset)
        print('test dataset: ', self.testdset)

        # verify image size
        assert opt.iSz in [224, 84], f' Got iSz: {opt.iSz}'

        # construct model
        self.model = Model(feat_arch=opt.feat_arch,
                           nClasses=len(self.traindset.classes))
        if opt.ngpus > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=range(opt.ngpus))
            print('Using ')
        self.model.cuda()
        print(self.model)

        if opt.steps is None:
            opt.steps = get_steps(len(self.traindset), bSz=opt.bSz)
        print(f'Using steps: {opt.steps}')
        opt.max_iter = opt.steps[-1]

        # setup optimizer and scheduler
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=opt.lr,
                                         momentum=opt.momentum,
                                         weight_decay=opt.wd,
                                         nesterov=opt.nesterov)
        self.scheduler = MultiStepWarmupLR(self.optimizer,
                                           milestones=opt.steps,
                                           gamma=opt.gamma,
                                           warmup_steps=opt.warmup_steps)

        self.iteration = 0

        self.ims = torch.FloatTensor().cuda()
        self.targets = torch.LongTensor().cuda()
        self.best_5shot = 0
        self.best_ckpt_file = os.path.join(self.writer.log_dir,
                                           'best_checkpoint.pth')

        cudnn.benchmark = True
        print(
            f'Dataset size: {len(self.traindset)}, bSz: {opt.bSz}, steps: {opt.steps}, len dataloader {len(self.trainloader)}'
        )
Example #28
def main(init_file, path_model="model_test/blabla",
         dataset='ImageNet', num_classes=1000,
         epochs=200, batch_size=64,
         resume_epoch=0, save_frequency=2,
         adversarial_training=None, attack_list=["PGDLinf", "PGDL2"],
         eot_samples=1,
         noise=None, sigma=0.25):

    torch.manual_seed(1234)

    job_env = submitit.JobEnvironment()
    print(job_env)
    torch.cuda.set_device(job_env.local_rank)

    torch.distributed.init_process_group(
        backend="nccl",
        init_method=init_file,
        world_size=job_env.num_tasks,
        rank=job_env.global_rank,
    )

    print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")

    if not os.path.exists(path_model):
        os.makedirs(path_model)

    # Load inputs
    if dataset == "ImageNet":
        train_loader = load_data(dataset=dataset,
                                 datadir="/datasets01_101/imagenet_full_size/061417/",  # to adapt
                                 batch_size_per_gpu=int(batch_size/job_env.num_tasks),
                                 job_env=job_env, train_mode=True)
    else:
        train_loader = load_data(dataset=dataset, datadir="datasets",
                                 batch_size_per_gpu=int(batch_size/job_env.num_tasks),
                                 job_env=job_env, train_mode=True)

    num_images = len(train_loader.dataset)
    # Classifier  definition
    if dataset == "ImageNet":
        # Classifier = models.resnet18(pretrained=False)

        Classifier, modelname = getNetwork(net_type='inceptionresnetv2', num_classes=num_classes)
    else:
        Classifier, modelname = getNetwork(net_type="wide-resnet", depth=28, widen_factor=10,
                                           dropout=0.3, num_classes=num_classes)
        Classifier.apply(conv_init)

    Classifier = RandModel(Classifier, noise=noise, sigma=sigma)
    Classifier.cuda(job_env.local_rank)

    cudnn.benchmark = True
    Classifier = torch.nn.parallel.DistributedDataParallel(
        Classifier, device_ids=[job_env.local_rank], output_device=job_env.local_rank)
    Classifier.train()
    print("Classifier initialized")
    # optimizer and criterion
    if adversarial_training == "MixMax":
        criterion = torch.nn.CrossEntropyLoss(reduction="none").cuda(job_env.local_rank)
    else:
        criterion = torch.nn.CrossEntropyLoss().cuda(job_env.local_rank)

    optimizer = torch.optim.SGD(
        Classifier.parameters(), lr=0.1*batch_size/256, momentum=0.9, weight_decay=5e-4)

    if dataset != "ImageNet":
        scheduler = get_scheduler(optimizer, policy="multistep", milestones=[
            60, 120, 160], gamma=0.2)
    else:
        scheduler = get_scheduler(optimizer, policy="multistep", milestones=[
            30, 60, 90], gamma=0.2)

    # resume learning
    if resume_epoch > 0:
        if os.path.isfile(path_model):
            print("=> loading checkpoint '{}'".format(path_model))
            checkpoint = torch.load(path_model)
            Classifier = checkpoint['net']
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(path_model))

    adversaries = dict()

    adversaries["CW"] = attacks.CarliniWagnerL2Attack(Classifier, num_classes,
                                                      learning_rate=0.01, binary_search_steps=9,
                                                      max_iterations=15, abort_early=True,
                                                      initial_const=0.001, clip_min=0.0, clip_max=1.)

    adversaries["EAD"] = attacks.ElasticNetL1Attack(Classifier, num_classes,
                                                    confidence=0,
                                                    targeted=False, learning_rate=0.01,
                                                    binary_search_steps=9, max_iterations=60,
                                                    abort_early=True, initial_const=1e-3,
                                                    clip_min=0., clip_max=1., beta=1e-3, decision_rule='EN')

    adversaries["PGDL1"] = attacks.SparseL1PGDAttack(Classifier, eps=10., nb_iter=10, eps_iter=2*10./10,
                                                     rand_init=False, clip_min=0.0, clip_max=1.0,
                                                     sparsity=0.05, eot_samples=eot_samples)

    adversaries["PGDLinf"] = attacks.LinfPGDAttack(Classifier, eps=0.031, nb_iter=10, eps_iter=2*0.031/10,
                                                   rand_init=True, clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)

    adversaries["PGDL2"] = attacks.L2PGDAttack(Classifier, eps=2., nb_iter=10, eps_iter=2*2./10,
                                               rand_init=True, clip_min=0.0, clip_max=1.0, eot_samples=eot_samples)

    adversaries["FGSM"] = attacks.GradientSignAttack(Classifier, loss_fn=None, eps=0.05, clip_min=0.,
                                                     clip_max=1., targeted=False, eot_samples=eot_samples)
    # TODO: add further L1 attacks

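    # adversarial_training selects the regime: None trains on clean inputs, "Single" uses one fixed
    # attack, "MixMean" averages the loss over all attacks, "MixRand" samples one attack per batch,
    # and "MixMax" minimizes the per-sample worst-case loss across attacks.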
    for epoch in range(epochs):
        current_num_input = 0

        running_loss = 0.0
        running_acc = 0

        start_time_epoch = time.time()
        for i, data in enumerate(train_loader, 0):

            inputs, labels = data
            inputs, labels = inputs.cuda(job_env.local_rank), labels.cuda(job_env.local_rank)

            if adversarial_training is None:
                outputs = Classifier(inputs)
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            elif adversarial_training == "Single":
                inputs_adv = adversaries[attack_list[0]].perturb(
                    inputs, labels)
                outputs = Classifier(inputs_adv)
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            elif adversarial_training == "MixMean":
                loss = 0
                for att in attack_list:
                    inputs_adv = adversaries[att].perturb(inputs, labels)
                    outputs = Classifier(inputs_adv)
                    loss += criterion(outputs, labels)
                loss /= len(attack_list)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            elif adversarial_training == "MixRand":
                att = random.choice(attack_list)
                inputs_adv = adversaries[att].perturb(inputs, labels)
                outputs = Classifier(inputs_adv)
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            elif adversarial_training == "MixMax":
                loss = torch.zeros_like(labels).float()
                for att in attack_list:
                    inputs_adv = adversaries[att].perturb(inputs, labels)
                    outputs = Classifier(inputs_adv)
                    l = criterion(outputs, labels).float()
                    loss = torch.max(loss, l)
                loss = loss.mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

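            # Track the running accuracy on the clean (unperturbed) inputs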
            with torch.no_grad():
                outputs = Classifier(inputs)
                _, predicted = torch.max(outputs.data, 1)

            running_loss += loss.item()
            running_acc += predicted.eq(labels.data).cpu().sum().numpy()
            curr_batch_size = inputs.size(0)

            if i % 5 == 4:
                print("Epoch :[", epoch+1, "/", epochs,
                      "] [", i*batch_size, "/", num_images,
                      "] Running loss:", running_loss/5,
                      ", Running accuracy:", running_acc/(5*curr_batch_size), " time:", time.time()-start_time_epoch)
                running_loss = 0.0
                running_acc = 0

        # save model
        if ((epoch + 1) % save_frequency == 0) and (job_env.global_rank == 0):

            state = {
                'epoch': epoch + 1,
                'model_state_dict': Classifier.state_dict(),
            }
            torch.save(state, os.path.join(
                path_model, "epoch_"+str(epoch+1)+'.t7'))

        scheduler.step()


if __name__ == "__main__":
    main()
Example #29
0
    def init_fn(self):

        job_env = submitit.JobEnvironment()

        self.train_ds = MixedDataset(self.options,
                                     ignore_3d=self.options.ignore_3d,
                                     is_train=True)

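        # Build the HMR regressor, move it to this task's GPU, and wrap it in DistributedDataParallel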
        self.model = hmr(config.SMPL_MEAN_PARAMS,
                         pretrained=True).to(self.device)
        self.model.cuda(job_env.local_rank)
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[job_env.local_rank],
            output_device=job_env.local_rank)

        if self.options.bExemplarMode:
            lr = 5e-5 * 0.2
        else:
            lr = self.options.lr
        self.optimizer = torch.optim.Adam(
            params=self.model.parameters(),
            #   lr=self.options.lr,
            lr=lr,
            weight_decay=0)

        if self.options.bUseSMPLX:  # SMPL-X model. No change is required for HMR training; SMPL-X ignores the hand and other extra parts.
            # SMPL uses 23 joints while SMPL-X uses 21, so the last two SMPL joints are automatically ignored.
            self.smpl = SMPLX(config.SMPL_MODEL_DIR,
                              batch_size=self.options.batch_size,
                              create_transl=False).to(self.device)
        else:  #Original SMPL
            self.smpl = SMPL(config.SMPL_MODEL_DIR,
                             batch_size=self.options.batch_size,
                             create_transl=False).to(self.device)

        # Per-vertex loss on the shape
        self.criterion_shape = nn.L1Loss().to(self.device)
        # Keypoint (2D and 3D) loss
        # No reduction because confidence weighting needs to be applied
        self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device)
        # Loss for SMPL parameter regression
        self.criterion_regr = nn.MSELoss().to(self.device)
        self.models_dict = {'model': self.model}
        self.optimizers_dict = {'optimizer': self.optimizer}
        self.focal_length = constants.FOCAL_LENGTH

        # Initialize SMPLify fitting module
        self.smplify = SMPLify(step_size=1e-2,
                               batch_size=self.options.batch_size,
                               num_iters=self.options.num_smplify_iters,
                               focal_length=self.focal_length)
        if self.options.pretrained_checkpoint is not None:
            print(">>> Load Pretrained mode: {}".format(
                self.options.pretrained_checkpoint))
            self.load_pretrained(
                checkpoint_file=self.options.pretrained_checkpoint)
            self.backupModel()

        # This should be called here, after the model has been loaded
        # if torch.cuda.device_count() > 1:
        assert torch.cuda.device_count() > 1
        print("Let's use", torch.cuda.device_count(), "GPUs!")

        # self.model = torch.nn.DataParallel(self.model)      #Failed...
        # self.model.cuda(job_env.local_rank)

        # Load dictionary of fits
        self.fits_dict = FitsDict(self.options, self.train_ds)

        # Create renderer
        self.renderer = None  # Renderer(focal_length=self.focal_length, img_res=self.options.img_res, faces=self.smpl.faces)

        #debug
        from torchvision.transforms import Normalize
        self.de_normalize_img = Normalize(mean=[
            -constants.IMG_NORM_MEAN[0] / constants.IMG_NORM_STD[0],
            -constants.IMG_NORM_MEAN[1] / constants.IMG_NORM_STD[1],
            -constants.IMG_NORM_MEAN[2] / constants.IMG_NORM_STD[2]
        ],
                                          std=[
                                              1 / constants.IMG_NORM_STD[0],
                                              1 / constants.IMG_NORM_STD[1],
                                              1 / constants.IMG_NORM_STD[2]
                                          ])

    def _train(self) -> Optional[float]:
        job_env = submitit.JobEnvironment()

        batch_step = 0 # forward batch count
        best_mrr = 0
        train_loss_meter = AverageMeter()
        print(f"Start training", flush=True)
        # Start from the loaded epoch
        start_epoch = self._state.epoch
        global_step = self._state.global_step
        for epoch in range(start_epoch, self._train_cfg.num_train_epochs):
            print(f"Start epoch {epoch}", flush=True)
            self._state.model.train()
            self._state.epoch = epoch
            for batch in self._train_loader:
                batch_step += 1
                batch = move_to_cuda(batch)
                loss = mhop_loss(self._state.model, batch, self._train_cfg)

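                # Scale the loss down so gradients accumulated over several batches keep a comparable magnitude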
                if self._train_cfg.gradient_accumulation_steps > 1:
                    loss = loss / self._train_cfg.gradient_accumulation_steps
                if self._train_cfg.fp16:
                    with amp.scale_loss(loss, self._state.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                
                train_loss_meter.update(loss.item())

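                # The optimizer and LR scheduler step only once every gradient_accumulation_steps
                # batches, after clipping gradients (amp master params when fp16 is enabled)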
                if (batch_step + 1) % self._train_cfg.gradient_accumulation_steps == 0:
                    if self._train_cfg.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self._state.optimizer), self._train_cfg.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self._state.model.parameters(), self._train_cfg.max_grad_norm)
                    self._state.optimizer.step()
                    self._state.lr_scheduler.step()
                    self._state.model.zero_grad()

                    global_step += 1
                    self._state.global_step = global_step

                    self.tb_logger.add_scalar('batch_train_loss',
                                        loss.item(), global_step)
                    self.tb_logger.add_scalar('smoothed_train_loss',
                                        train_loss_meter.avg, global_step)

            # Checkpoint and evaluate at the end of each epoch; the rank-0 guard below is
            # currently commented out, so every task writes a checkpoint.
            # if job_env.global_rank == 0:
            self.checkpoint(rm_init=False)
            mrrs = self._eval()
            mrr = mrrs["mrr_avg"]
            self.tb_logger.add_scalar('dev_mrr', mrr*100, epoch)
            self._state.lr_scheduler.step(mrr)
            if best_mrr < mrr:
                print("Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" % (best_mrr*100, mrr*100, epoch))
                torch.save(self._state.model.state_dict(), os.path.join(self._train_cfg.output_dir, f"checkpoint_best.pt"))
                best_mrr = mrr
            self.log({
                "best_mrr": best_mrr,
                "curr_mrr": mrr,
                "smoothed_loss": train_loss_meter.avg,
                "epoch": epoch
            })
        return best_mrr