Code example #1
import time

import torch

# cfg and compute_loss are assumed to come from PyG's GraphGym modules
# (these imports are not shown in the original snippet).
from torch_geometric.graphgym.config import cfg
from torch_geometric.graphgym.loss import compute_loss


def train_epoch(logger, loader, model, optimizer, scheduler):
    # Benchmark utilities are only imported when benchmarking is enabled,
    # so this variant of train_epoch expects cfg.benchmark to be True.
    if cfg.benchmark:
        from torch_geometric.graphgym.benchmark import global_line_profiler, \
            torchprof, get_memory_status, count_parameters, \
            get_gpu_memory_nvdia, get_model_size
    model.train()
    time_start = time.time()
    # Profile per-module CUDA time and memory for everything run in this block.
    with torchprof.Profile(model, use_cuda=True, profile_memory=True):
        for batch in loader:
            batch.split = 'train'
            optimizer.zero_grad()
            batch.to(torch.device(cfg.device))
            pred, true = model(batch)
            loss, pred_score = compute_loss(pred, true)
            loss.backward()
            optimizer.step()

            # Snapshot (allocated, reserved, active) CUDA memory after the step.
            curr_mem_usage = get_memory_status(global_line_profiler)

            logger.update_stats(true=true.detach().cpu(),
                                pred=pred_score.detach().cpu(),
                                loss=loss.item(),
                                lr=scheduler.get_last_lr()[0],
                                time_used=time.time() - time_start,
                                params=cfg.params,
                                allocated_cuda=curr_mem_usage[0],
                                reserved_cuda=curr_mem_usage[1],
                                active_cuda=curr_mem_usage[2],
                                nvidia_cuda_free=get_gpu_memory_nvdia()[0],
                                nvidia_cuda_used=get_gpu_memory_nvdia()[1],
                                trainable_params=count_parameters(model),
                                model_size=get_model_size(model))
            time_start = time.time()
        # The LR scheduler is stepped once per epoch, after the batch loop.
        scheduler.step()
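
The allocated / reserved / active figures logged above are standard counters of PyTorch's CUDA caching allocator. As a point of reference, the sketch below shows equivalent queries through plain torch.cuda; get_memory_status is assumed to wrap something similar, so field names and units may differ.

import torch

def cuda_memory_snapshot(device=0):
    # Current allocated / reserved / active bytes of the CUDA caching allocator.
    stats = torch.cuda.memory_stats(device)
    return (stats['allocated_bytes.all.current'],
            stats['reserved_bytes.all.current'],
            stats['active_bytes.all.current'])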
Code example #2
def _shared_step(self, batch, split: str) -> Dict:
    # Shared forward/loss logic reused by the enclosing module's per-split
    # step hooks; Dict is typing.Dict, and compute_loss / time come from the
    # surrounding module.
    batch.split = split
    pred, true = self(batch)
    loss, pred_score = compute_loss(pred, true)
    step_end_time = time.time()
    return dict(loss=loss, true=true, pred_score=pred_score,
                step_end_time=step_end_time)
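
A shared step like this is typically dispatched from the per-split hooks of a PyTorch Lightning module. The following is a minimal sketch of that wiring, not the original implementation; the hook names follow the Lightning API, while the bodies are illustrative.

def training_step(self, batch, *args, **kwargs) -> Dict:
    return self._shared_step(batch, split='train')

def validation_step(self, batch, *args, **kwargs) -> Dict:
    return self._shared_step(batch, split='val')

def test_step(self, batch, *args, **kwargs) -> Dict:
    return self._shared_step(batch, split='test')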
Code example #3
@torch.no_grad()  # gradients are not needed during evaluation
def eval_epoch(logger, loader, model, split='val'):
    # cfg, compute_loss, time and torch are assumed to be imported as in
    # code example #1.
    model.eval()
    time_start = time.time()
    for batch in loader:
        batch.split = split
        batch.to(torch.device(cfg.device))
        pred, true = model(batch)
        loss, pred_score = compute_loss(pred, true)
        logger.update_stats(true=true.detach().cpu(),
                            pred=pred_score.detach().cpu(), loss=loss.item(),
                            lr=0, time_used=time.time() - time_start,
                            params=cfg.params)
        time_start = time.time()
Code example #4
def train_epoch(logger, loader, model, optimizer, scheduler):
    # Plain training epoch without profiling; cfg, compute_loss, time and
    # torch are assumed to be imported as in code example #1.
    model.train()
    time_start = time.time()
    for batch in loader:
        optimizer.zero_grad()
        batch.to(torch.device(cfg.device))
        pred, true = model(batch)
        loss, pred_score = compute_loss(pred, true)
        loss.backward()
        optimizer.step()
        logger.update_stats(true=true.detach().cpu(),
                            pred=pred_score.detach().cpu(),
                            loss=loss.item(),
                            lr=scheduler.get_last_lr()[0],
                            time_used=time.time() - time_start,
                            params=cfg.params)
        time_start = time.time()
    scheduler.step()
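
For completeness, here is a minimal sketch of an epoch loop that drives train_epoch and eval_epoch together, assuming the GraphGym convention of one logger and one loader per split with the training split at index 0. Checkpointing and eval-frequency logic are omitted, and the names are illustrative.

def train(loggers, loaders, model, optimizer, scheduler):
    for cur_epoch in range(cfg.optim.max_epoch):
        train_epoch(loggers[0], loaders[0], model, optimizer, scheduler)
        loggers[0].write_epoch(cur_epoch)
        # Evaluate on the remaining splits (e.g. 'val', 'test').
        for i in range(1, len(loaders)):
            eval_epoch(loggers[i], loaders[i], model,
                       split=['val', 'test'][i - 1])
            loggers[i].write_epoch(cur_epoch)
    for logger in loggers:
        logger.close()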