Example #1
def evaluate(model, criterion, data_loader, device):
    model.eval()
    metric_logger = MetricLogger(delimiter="  ")
    header = 'Test:'
    with torch.no_grad():
        for video, target in metric_logger.log_every(data_loader, 100, header):
            start_time = time.time()
            video = video.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            output = model(video)
            time_diff = time.time() - start_time
            print("Predicting on a video of shape {} took {} seconds".format(
                video.shape, time_diff))
            print("target shape {}".format(target.shape))
            print("target {}".format(target))
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = video.shape[0]
            metric_logger.update(loss=loss.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)

    print(
        ' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'.
        format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
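These snippets assume an `accuracy` helper is in scope. A sketch along the lines of the top-k accuracy helper in torchvision's reference training scripts (a stand-in, not necessarily the exact code each example imports):

import torch

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy (in percent) of `output` logits against integer labels."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # indices of the maxk highest logits per sample, shape (maxk, batch)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res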
Example #2
def evaluate(model, epoch, criterion, data_loader, device, writer):
    model.eval()
    metric_logger = MetricLogger(delimiter="  ")
    header = 'Test:'
    cntr = 0
    running_accuracy = 0.0
    with torch.no_grad():
        for video, target in metric_logger.log_every(data_loader, 100, header):
            video = video.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            output = model(video)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = video.shape[0]
            running_accuracy += acc1.item()
            if cntr % 10 == 9:  # log accuracy averaged over the last 10 mini-batches
                writer.add_scalar('validation accuracy',
                                  running_accuracy / 10,
                                  epoch * len(data_loader) + cntr)
                running_accuracy = 0.0
            cntr += 1
            metric_logger.update(loss=loss.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)

    print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'
          .format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
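Example #2 expects the caller to own the TensorBoard writer. A minimal wiring sketch; the log directory is arbitrary, and `model`, `criterion`, `val_loader`, `device`, and `num_epochs` are placeholders for the caller's own setup:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/video_cls')  # arbitrary log directory
for epoch in range(num_epochs):  # model, criterion, val_loader, device: caller's setup
    clip_acc1 = evaluate(model, epoch, criterion, val_loader, device, writer)
writer.close()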
Example #3
def train_one_epoch(model,
                    optimizer,
                    lr_scheduler,
                    data_loader,
                    epoch,
                    print_freq,
                    checkpoint_fn=None):
    model.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('batch/s',
                            SmoothedValue(window_size=10, fmt='{value:.3f}'))

    header = 'Epoch: [{}]'.format(epoch)

    for step, batched_inputs in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        start_time = time.time()
        loss = model(batched_inputs)

        if checkpoint_fn is not None and np.random.random() < 0.005:
            checkpoint_fn()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        metric_logger.update(loss=loss.item(),
                             lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['batch/s'].update((time.time() - start_time))
        lr_scheduler.step()

    if checkpoint_fn is not None:
        checkpoint_fn()
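Most examples here use the torchvision-style `MetricLogger`/`SmoothedValue` pair: `update(**kwargs)`, named `meters`, `add_meter`, and a `log_every` generator. A condensed sketch of that interface, without the distributed synchronization of the full reference implementation:

from collections import defaultdict, deque

import torch

class SmoothedValue:
    """Track a window of recent values plus global statistics."""

    def __init__(self, window_size=20, fmt='{median:.4f} ({global_avg:.4f})'):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    @property
    def median(self):
        return torch.tensor(list(self.deque)).median().item()

    @property
    def avg(self):
        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(median=self.median, avg=self.avg,
                               global_avg=self.global_avg, value=self.value)

class MetricLogger:
    def __init__(self, delimiter='\t'):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            self.meters[k].update(float(v))

    def __getattr__(self, attr):
        # lets callers write metric_logger.acc1 for metric_logger.meters['acc1']
        meters = self.__dict__.get('meters', {})
        if attr in meters:
            return meters[attr]
        raise AttributeError(attr)

    def __getitem__(self, name):
        return self.meters[name]

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=''):
        for i, obj in enumerate(iterable):
            yield obj
            if i % print_freq == 0:
                stats = self.delimiter.join(
                    '{}: {}'.format(name, meter)
                    for name, meter in self.meters.items())
                print('{} [{}/{}] {}'.format(header, i, len(iterable), stats))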
Example #4
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch):

    model.train()

    sums = defaultdict(lambda: 0.0)
    start1 = time()

    metric = MetricLogger("train_iteration")
    metric["epoch"] = epoch

    for waveform, specgram, target in bg_iterator(data_loader, maxsize=2):

        start2 = time()

        waveform = waveform.to(device)
        specgram = specgram.to(device)
        target = target.to(device)

        output = model(waveform, specgram)
        output, target = output.squeeze(1), target.squeeze(1)

        loss = criterion(output, target)
        loss_item = loss.item()
        sums["loss"] += loss_item
        metric["loss"] = loss_item

        optimizer.zero_grad()
        loss.backward()

        if args.clip_grad > 0:
            gradient = torch.nn.utils.clip_grad_norm_(
                model.parameters(), args.clip_grad
            )
            sums["gradient"] += gradient.item()
            metric["gradient"] = gradient.item()

        optimizer.step()

        metric["iteration"] = sums["iteration"]
        metric["time"] = time() - start2
        metric()
        sums["iteration"] += 1

    avg_loss = sums["loss"] / len(data_loader)

    metric = MetricLogger("train_epoch")
    metric["epoch"] = epoch
    metric["loss"] = avg_loss
    metric["gradient"] = sums["gradient"] / len(data_loader)
    metric["time"] = time() - start1
    metric()
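Examples #4-#7, #10, and #17 instead use a dict-style logger in the torchaudio pipeline mold: values are assigned or `+=`-accumulated on string keys, and the logger is called to emit one record. A minimal sketch consistent with that usage (the JSON-lines output format is an assumption):

import json
from collections import defaultdict

class MetricLogger(defaultdict):
    """Dict-style logger: assign metric["key"] = value, then call metric()
    to emit the current record. Missing keys read as 0.0, so
    metric["cumulative loss"] += x works on the first iteration."""

    def __init__(self, group, disable=False):
        super().__init__(float)
        self.group = group
        self.disable = disable

    def __str__(self):
        # coerce tensors/numpy scalars to plain floats for serialization
        record = {k: (float(v) if hasattr(v, 'item') else v)
                  for k, v in self.items()}
        return json.dumps({'group': self.group, **record})

    def __call__(self):
        if not self.disable:
            print(self, flush=True)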
Example #5
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch):

    model.train()

    sums = defaultdict(lambda: 0.0)
    start1 = time()

    metric = MetricLogger("train_iteration")
    metric["epoch"] = epoch

    for i, batch in enumerate(data_loader):

        start2 = time()

        adjust_learning_rate(epoch, optimizer, args.learning_rate,
                             args.anneal_steps, args.anneal_factor)

        x, y, _ = batch_to_gpu(batch)

        y_pred = model(x)
        loss = criterion(y_pred, y)

        loss_item = loss.item()
        sums["loss"] += loss_item
        metric["loss"] = loss_item

        optimizer.zero_grad()
        loss.backward()

        if args.clip_grad > 0:
            gradient = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                      args.clip_grad)
            sums["gradient"] += gradient.item()
            metric["gradient"] = gradient.item()

        optimizer.step()

        metric["iteration"] = sums["iteration"]
        metric["time"] = time() - start2
        metric()
        sums["iteration"] += 1

    avg_loss = sums["loss"] / len(data_loader)

    metric = MetricLogger("train_epoch")
    metric["epoch"] = epoch
    metric["loss"] = avg_loss
    metric["gradient"] = sums["gradient"] / len(data_loader)
    metric["time"] = time() - start1
    metric()
Example #6
def validate(model, criterion, data_loader, device, epoch):

    with torch.no_grad():

        model.eval()
        sums = defaultdict(lambda: 0.0)
        start = time()

        for batch in data_loader:

            x, y, _ = batch_to_gpu(batch)

            y_pred = model(x)
            loss = criterion(y_pred, y)

            loss_item = loss.item()
            sums["loss"] += loss_item

        avg_loss = sums["loss"] / len(data_loader)

        metric = MetricLogger("validation")
        metric["epoch"] = epoch
        metric["loss"] = avg_loss
        metric["time"] = time() - start
        metric()

        return avg_loss
Example #7
def validate(model, criterion, data_loader, device, epoch):

    with torch.no_grad():

        model.eval()
        sums = defaultdict(lambda: 0.0)
        start = time()

        for waveform, specgram, target in bg_iterator(data_loader, maxsize=2):

            waveform = waveform.to(device)
            specgram = specgram.to(device)
            target = target.to(device)

            output = model(waveform, specgram)
            output, target = output.squeeze(1), target.squeeze(1)

            loss = criterion(output, target)
            sums["loss"] += loss.item()

        avg_loss = sums["loss"] / len(data_loader)

        metric = MetricLogger("validation")
        metric["epoch"] = epoch
        metric["loss"] = avg_loss
        metric["time"] = time() - start
        metric()

        return avg_loss
Example #8
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr',
                            SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters,
                                           warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
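Example #8 leans on two helpers from torchvision's detection reference scripts. Sketches that follow that reference closely:

import torch
import torch.distributed as dist

def reduce_dict(input_dict, average=True):
    """Reduce a dict of scalar tensors across processes so every rank logs
    the same (averaged) values; a no-op outside distributed runs."""
    world_size = (dist.get_world_size()
                  if dist.is_available() and dist.is_initialized() else 1)
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        return dict(zip(names, values))

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    """Linearly ramp the LR from warmup_factor * base_lr up to base_lr."""
    def f(x):
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha
    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)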
Example #9
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
                    device, epoch, print_freq, writer):
    model.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('clips/s',
                            SmoothedValue(window_size=10, fmt='{value:.3f}'))
    running_loss = 0.0
    running_accuracy = 0.0
    header = 'Epoch: [{}]'.format(epoch)
    cntr = 0
    for video, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        start_time = time.time()
        video, target = video.to(device), target.to(device)
        output = model(video)
        loss = criterion(output, target)

        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        batch_size = video.shape[0]
        running_loss += loss.item()
        running_accuracy += acc1.item()
        if cntr % 10 == 9:  # log running averages over the last 10 mini-batches
            writer.add_scalar('training loss', running_loss / 10,
                              epoch * len(data_loader) + cntr)
            writer.add_scalar('learning rate', optimizer.param_groups[0]["lr"],
                              epoch * len(data_loader) + cntr)
            writer.add_scalar('accuracy', running_accuracy / 10,
                              epoch * len(data_loader) + cntr)
            running_loss = 0.0
            running_accuracy = 0.0
        cntr += 1
        metric_logger.update(loss=loss.item(),
                             lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
        metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
        metric_logger.meters['clips/s'].update(batch_size /
                                               (time.time() - start_time))
        lr_scheduler.step()
Example #10
def evaluate(
    model,
    criterion,
    data_loader,
    decoder,
    language_model,
    device,
    epoch,
    disable_logger=False,
):

    with torch.no_grad():

        model.eval()
        start = time()
        metric = MetricLogger("validation", disable=disable_logger)
        metric["epoch"] = epoch

        for inputs, targets, tensors_lengths, target_lengths in bg_iterator(
                data_loader, maxsize=2):

            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # keep batch first for data parallel
            outputs = model(inputs).transpose(-1, -2).transpose(0, 1)

            # CTC
            # outputs: input length, batch size, number of classes (including blank)
            # targets: batch size, max target length
            # input_lengths: batch size
            # target_lengths: batch size

            metric["cumulative loss"] += criterion(outputs, targets,
                                                   tensors_lengths,
                                                   target_lengths).item()

            metric["dataset length"] += len(inputs)
            metric["iteration"] += 1

            compute_error_rates(outputs, targets, decoder, language_model,
                                metric)

        metric["average loss"] = metric["cumulative loss"] / metric["iteration"]
        metric["validation time"] = time() - start
        metric()

        return metric["average loss"]
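The shape comments in Examples #10 and #17 describe the contract of `torch.nn.CTCLoss`. A small self-contained check of those shapes (all sizes here are arbitrary):

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 28, 12  # input length, batch, classes (incl. blank), max target length
criterion = nn.CTCLoss(blank=0, zero_infinity=True)

log_probs = torch.randn(T, N, C).log_softmax(2)        # (input length, batch, classes)
targets = torch.randint(1, C, (N, S))                  # (batch, max target length), no blanks
input_lengths = torch.full((N,), T, dtype=torch.long)  # (batch,)
target_lengths = torch.randint(5, S + 1, (N,))         # (batch,)

loss = criterion(log_probs, targets, input_lengths, target_lengths)
print(loss.item())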
Example #11
    def train_eval_mse(self, model, valset, writer, global_step, device):
        """
        Evaluate MSE during training
        """
        num_samples = eval_cfg.train.num_samples.mse
        batch_size = eval_cfg.train.batch_size
        num_workers = eval_cfg.train.num_workers

        model.eval()

        if valset is None:
            data_set = np.load('../data/TABLE/val/all_set_val.npy')
            data_size = len(data_set)
            # create shuffled indices so batches sample the data randomly
            idx_set = np.arange(data_size)
            np.random.shuffle(idx_set)
            idx_set = idx_set[:num_samples]
            idx_set = np.split(idx_set, len(idx_set) // batch_size)
            data_to_enumerate = idx_set
        else:
            valset = Subset(valset, indices=range(num_samples))
            dataloader = DataLoader(valset,
                                    batch_size=batch_size,
                                    num_workers=num_workers,
                                    shuffle=False)
            data_to_enumerate = dataloader

        metric_logger = MetricLogger()

        print(f'Evaluating MSE using {num_samples} samples.')
        with tqdm(total=num_samples) as pbar:
            for batch_idx, sample in enumerate(data_to_enumerate):
                if valset is None:
                    data_i = data_set[sample]
                    data_i = torch.from_numpy(data_i).float().to(device)
                    data_i /= 255
                    data_i = data_i.permute([0, 3, 1, 2])
                    imgs = data_i
                else:
                    imgs = sample.to(device)
                loss, log = model(imgs, global_step)
                B = imgs.size(0)
                for b in range(B):
                    metric_logger.update(mse=log['mse'][b])
                metric_logger.update(loss=loss.mean())
                pbar.update(B)

        assert metric_logger['mse'].count == num_samples
        # Add last log
        # log.update([(k, torch.tensor(v.global_avg)) for k, v in metric_logger.values.items()])
        mse = metric_logger['mse'].global_avg
        writer.add_scalar('val/mse', mse, global_step=global_step)

        model.train()

        return mse
Example #12
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    for images, targets in metric_logger.log_every(data_loader, 100, header):
        images = list(img.to(device) for img in images)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)

        outputs = [{k: v.to(cpu_device)
                    for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {
            target["image_id"].item(): output
            for target, output in zip(targets, outputs)
        }
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time,
                             evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
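Example #12's `_get_iou_types` selects the COCO metric types from the model class; the torchvision detection reference defines it essentially as:

import torch
import torchvision

def _get_iou_types(model):
    """Choose COCO IoU types based on the (possibly DDP-wrapped) model class."""
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
        iou_types.append("segm")
    if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
        iou_types.append("keypoints")
    return iou_types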
Example #13
    def train_eval_mse(self, model, valset, writer, global_step, device):
        """
        Evaluate MSE during training
        """
        num_samples = eval_cfg.train.num_samples.mse
        batch_size = eval_cfg.train.batch_size
        num_workers = eval_cfg.train.num_workers
        
        model.eval()
        # valset = Subset(valset, indices=range(num_samples))
        dataloader = DataLoader(valset, batch_size=batch_size, shuffle=True,
                                num_workers=0, drop_last=True,
                                collate_fn=valset.collate_fn)
        #dataloader = DataLoader(valset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    
        metric_logger = MetricLogger()
    
        print(f'Evaluating MSE using {num_samples} samples.')
        n_batch = 0
        with tqdm(total=num_samples) as pbar:
            for batch_idx, sample in enumerate(dataloader):
                imgs = sample[0].to(device)
                loss, log = model(imgs, global_step)
                B = imgs.size(0)
                for b in range(B):
                    metric_logger.update(
                        mse=log['mse'][b],
                    )
                metric_logger.update(loss=loss.mean())
                pbar.update(1)
                n_batch += 1
                if n_batch >= num_samples: break

        # assert metric_logger['mse'].count == num_samples
        # Add last log
        # log.update([(k, torch.tensor(v.global_avg)) for k, v in metric_logger.values.items()])
        mse = metric_logger['mse'].global_avg
        writer.add_scalar('val/mse', mse, global_step=global_step)
    
        model.train()
        
        return mse
Example #14
def train_one_epoch(train_loader, model, criterion, optimizer, writer, epoch,
                    total_step, config):
    log_header = 'EPOCH {}'.format(epoch)
    losses = AverageMeter('Loss', fmt=':.4f')
    if config.method != 'byol':
        top1 = AverageMeter('Acc1', fmt=':4.2f')
        top5 = AverageMeter('Acc5', fmt=':4.2f')
    lr = AverageMeter('Lr', fmt=":.6f")

    metric_logger = MetricLogger(delimeter=" | ")
    metric_logger.add_meter(losses)
    if config.method != 'byol':
        metric_logger.add_meter(top1)
        metric_logger.add_meter(top5)
    metric_logger.add_meter(lr)
    # ce = nn.CrossEntropyLoss().cuda(config.system.gpu)
    # num_steps_per_epoch = int(len(train_loader.dataset) // config.train.batch_size)
    # global_step = num_steps_per_epoch * epoch
    for step, (images, _) in enumerate(
            metric_logger.log_every(train_loader, config.system.print_freq,
                                    log_header)):
        total_step.val += 1
        if config.system.gpu is not None:
            images[0] = images[0].cuda(config.system.gpu, non_blocking=True)
            images[1] = images[1].cuda(config.system.gpu, non_blocking=True)

        # [pos, neg]
        # output = model(view_1=images[0], view_2=images[1])
        # loss, logits, targets = criterion(output)
        if config.method != 'byol':
            logits, targets, logits_original = model(view_1=images[0],
                                                     view_2=images[1])
            loss = criterion(logits, targets)
            acc1, acc5 = accuracy(logits_original, targets, topk=(1, 5))
        else:
            loss_pre = model(view_1=images[0], view_2=images[1])
            loss = loss_pre.mean()

        lr_ = optimizer.param_groups[0]['lr']

        if config.method != 'byol':
            metric_logger.update(Loss=loss.detach().cpu().item(),
                                 Acc1=acc1.detach().cpu().item(),
                                 Acc5=acc5.detach().cpu().item(),
                                 Lr=lr_)
        else:
            metric_logger.update(Loss=loss.detach().cpu().item(), Lr=lr_)

        writer.add_scalar('loss', loss.detach().cpu().item(), total_step.val)
        if config.method != 'byol':
            writer.add_scalar('top1',
                              acc1.detach().cpu().item(), total_step.val)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #15
def train(proc_id, n_gpus, args, devices, movielens):
    # Start up distributed training, if enabled.
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    th.cuda.set_device(dev_id)
    # set random seed in each gpu
    th.manual_seed(args.seed)
    if th.cuda.is_available():
        th.cuda.manual_seed_all(args.seed)

    # Split train_dataset and set dataloader
    train_rating_pairs = th.split(th.stack(movielens.train_rating_pairs),
                                  len(movielens.train_rating_values) //
                                  args.n_gpus,
                                  dim=1)[proc_id]
    train_rating_values = th.split(movielens.train_rating_values,
                                   len(movielens.train_rating_values) //
                                   args.n_gpus,
                                   dim=0)[proc_id]

    train_dataset = MovieLensDataset(train_rating_pairs, train_rating_values,
                                     movielens.train_graph, args.hop,
                                     args.sample_ratio, args.max_nodes_per_hop)
    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    if proc_id == 0:
        if args.testing:
            test_dataset = MovieLensDataset(movielens.test_rating_pairs,
                                            movielens.test_rating_values,
                                            movielens.train_graph, args.hop,
                                            args.sample_ratio,
                                            args.max_nodes_per_hop)
        else:
            test_dataset = MovieLensDataset(movielens.valid_rating_pairs,
                                            movielens.valid_rating_values,
                                            movielens.train_graph, args.hop,
                                            args.sample_ratio,
                                            args.max_nodes_per_hop)
        test_loader = th.utils.data.DataLoader(test_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.num_workers,
                                               collate_fn=collate_movielens)

    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  #dataset_base.num_rating, 
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        #  side_features=args.use_features,
        #  n_side_features=n_features,
        #  multiply_by=args.multiply_by
    ).to(dev_id)
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[dev_id],
                                        output_device=dev_id)
    loss_fn = nn.MSELoss().to(dev_id)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.train_lr,
                           weight_decay=0)

    if proc_id == 0:
        print("Loading network finished ...\n")
        # prepare the logger
        logger = MetricLogger(args.save_dir, args.valid_log_interval)

        best_epoch = 0
        best_rmse = np.inf
        print("Start training ...")

    for epoch_idx in range(1, args.train_epochs + 1):
        if proc_id == 0:
            print('Epoch', epoch_idx)

        train_loss = train_epoch(proc_id, n_gpus, model, loss_fn, optimizer,
                                 args.arr_lambda, train_loader, dev_id,
                                 args.train_log_interval)

        if n_gpus > 1:
            th.distributed.barrier()
        if proc_id == 0:
            test_rmse = evaluate(model, test_loader, dev_id)
            eval_info = {
                'epoch': epoch_idx,
                'train_loss': train_loss,
                'test_rmse': test_rmse,
            }
            print(
                '=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
                    *eval_info.values()))

            if epoch_idx % args.train_lr_decay_step == 0:
                for param in optimizer.param_groups:
                    param['lr'] = args.train_lr_decay_factor * param['lr']

            logger.log(eval_info, model, optimizer)
            if best_rmse > test_rmse:
                best_rmse = test_rmse
                best_epoch = epoch_idx

    if n_gpus > 1:
        th.distributed.barrier()
    if proc_id == 0:
        eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
            best_rmse, best_epoch)
        print(eval_info)
        with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
            f.write(eval_info)
Example #16
def train(args):
    print(args)

    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)

    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    #args.decoder = "MLP"
    net = Net(args=args)

    #print(args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels

    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_valid_ndcg = -np.inf
    best_test_ndcg = -np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)

    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.valid_recall_dec_graph = dataset.valid_recall_dec_graph.to(
        args.device)

    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)
    dataset.test_recall_dec_graph = dataset.test_recall_dec_graph.to(
        args.device)

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        '''
        noisy_labels = th.LongTensor(np.random.choice([-1, 0, 1], train_gt_ratings.shape[0], replace=True, p=[0.001, 0.998, 0.001])).to(args.device)

        train_gt_labels += noisy_labels
    
        max_label = dataset.max_l + th.zeros_like(train_gt_labels)
        min_label = dataset.min_l + th.zeros_like(train_gt_labels)
        max_label = max_label.long()
        min_label = min_label.long()
        train_gt_labels = th.where(train_gt_labels > max_label, max_label, train_gt_labels)
        train_gt_labels = th.where(train_gt_labels < min_label, min_label, train_gt_labels)
        '''

        if iter_idx > 3:
            t0 = time.time()
        net.train()
        # two-stage training is disabled; the original switch (enable after
        # iter_idx > 250) was immediately overridden here
        Two_Stage = False
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            dataset.train_enc_graph, dataset.train_dec_graph,
            dataset.user_feature, dataset.movie_feature, Two_Stage)
        #print("user_out:\n", user_out[0])
        #print("movie_out:\n", movie_out[0])
        #print("W:\n", W.shape)
        if args.loss_func == "CE":
            loss = rating_loss_net(
                pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss
            '''
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                nd_possible_rating_values.view(1, -1)).sum(dim=1)
            mse_loss = th.sum((real_pred_ratings - train_gt_ratings) ** 2)
            loss += mse_loss * 0.0001
            '''
        elif args.loss_func == "Hinge":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
            gap = (real_pred_ratings - train_gt_labels)**2
            hinge_loss = th.where(gap > 1.0, gap * gap, gap).mean()
            loss = hinge_loss
        elif args.loss_func == "MSE":
            '''
            seeds = th.arange(pred_ratings.shape[0])
            random.shuffle(seeds)
            for i in range((pred_ratings.shape[0] - 1) // 50 + 1):
                start = i * 50
                end = (i + 1) * 50
                if end > (pred_ratings.shape[0] - 1):
                    end = pred_ratings.shape[0] - 1
                batch = seeds[start:end]
                loss = F.mse_loss(pred_ratings[batch, 0], nd_possible_rating_values[train_gt_labels[batch]]) + args.ARR * reg_loss
                count_loss += loss.item() * 50 / pred_ratings.shape[0]
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                #nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
                optimizer.step()
                pred_ratings, reg_loss = net(dataset.train_enc_graph, dataset.train_dec_graph,
                                   dataset.user_feature, dataset.movie_feature)
            '''
            loss = th.mean((pred_ratings[:, 0] -
                            nd_possible_rating_values[train_gt_labels])**
                           2) + args.ARR * reg_loss
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        if args.loss_func == "CE":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
        elif args.loss_func == "MSE":
            real_pred_ratings = pred_ratings[:, 0]
        rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            ndcg_valid = evaluate_metric(args=args,
                                         net=net,
                                         dataset=dataset,
                                         segment='valid',
                                         debug=False)
            print("ndcg_valid:", ndcg_valid)
            valid_loss_logger.log(iter=iter_idx,
                                  rmse=valid_rmse,
                                  ndcg_20=ndcg_valid[0],
                                  ndcg_40=ndcg_valid[1],
                                  ndcg_80=ndcg_valid[2])
            print("-" * 80)

            #test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
            #test_loss_logger.log(iter=iter_idx, rmse=test_rmse, ndcg_20 = ndcg_k[0], ndcg_40 = ndcg_k[1], ndcg_80 = ndcg_k[2])
            #logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            logging_str += ',\tndcg_valid_20={:.4f}'.format(ndcg_valid[0])
            logging_str += ',\tndcg_valid_40={:.4f}'.format(ndcg_valid[1])
            logging_str += ',\tndcg_valid_80={:.4f}'.format(ndcg_valid[2])

            ndcg_valid_20 = ndcg_valid[0]
            #print("***********",ndcg_valid_20)

            if ndcg_valid_20 > best_valid_ndcg:
                best_valid_ndcg = ndcg_valid_20
                print("************best_valid_ndcg:", best_valid_ndcg)
                print("************ndcg_valid_20:", ndcg_valid_20)
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test',
                                     debug=True,
                                     idx=iter_idx)
                ndcg_test = evaluate_metric(args=args,
                                            net=net,
                                            dataset=dataset,
                                            segment='test',
                                            debug=False)
                logging_str += ',\tbest ndcg_test_20={:.4f}'.format(ndcg_test[0])
                logging_str += ',\tbest ndcg_test_40={:.4f}'.format(ndcg_test[1])
                logging_str += ',\tbest ndcg_test_80={:.4f}'.format(ndcg_test[2])
                #best_test_rmse = test_rmse
                best_test_ndcg = ndcg_test
                #test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                test_loss_logger.log(iter=iter_idx,
                                     rmse=test_rmse,
                                     ndcg_20=ndcg_test[0],
                                     ndcg_40=ndcg_test[1],
                                     ndcg_80=ndcg_test[2])
                #logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
            #print("************best_valid_ndcg:",best_valid_ndcg)
            #print("************ndcg_valid_20:",ndcg_valid_20)
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print(
        'Best Iter Idx={}, best ndcg_20={:.4f}, best ndcg_40={:.4f}, best ndcg_80={:.4f}'
        .format(best_iter, best_test_ndcg[0], best_test_ndcg[1],
                best_test_ndcg[2]))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #17
def train_one_epoch(
    model,
    criterion,
    optimizer,
    scheduler,
    data_loader,
    decoder,
    language_model,
    device,
    epoch,
    clip_grad,
    disable_logger=False,
    reduce_lr_on_plateau=False,
):

    model.train()

    metric = MetricLogger("train", disable=disable_logger)
    metric["epoch"] = epoch

    for inputs, targets, tensors_lengths, target_lengths in bg_iterator(
            data_loader, maxsize=2):

        start = time()
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # keep batch first for data parallel
        outputs = model(inputs).transpose(-1, -2).transpose(0, 1)

        # CTC
        # outputs: input length, batch size, number of classes (including blank)
        # targets: batch size, max target length
        # input_lengths: batch size
        # target_lengths: batch size

        loss = criterion(outputs, targets, tensors_lengths, target_lengths)

        optimizer.zero_grad()
        loss.backward()

        if clip_grad > 0:
            metric["gradient"] = torch.nn.utils.clip_grad_norm_(
                model.parameters(), clip_grad)

        optimizer.step()

        compute_error_rates(outputs, targets, decoder, language_model, metric)

        try:
            metric["lr"] = scheduler.get_last_lr()[0]
        except AttributeError:
            metric["lr"] = optimizer.param_groups[0]["lr"]

        metric["batch size"] = len(inputs)
        metric["n_channel"] = inputs.shape[1]
        metric["n_time"] = inputs.shape[-1]
        metric["dataset length"] += metric["batch size"]
        metric["iteration"] += 1
        metric["loss"] = loss.item()
        metric["cumulative loss"] += metric["loss"]
        metric["average loss"] = metric["cumulative loss"] / metric["iteration"]
        metric["iteration time"] = time() - start
        metric["epoch time"] += metric["iteration time"]
        metric()

    if reduce_lr_on_plateau and isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(metric["average loss"])
    elif not isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step()
Example #18
def train(cfg):
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.cuda.manual_seed(cfg.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Some info
    print('Experiment name:', cfg.exp_name)
    print('Model name:', cfg.model)
    print('Dataset:', cfg.dataset)
    print('Resume:', cfg.resume)
    if cfg.resume:
        print('Checkpoint:',
              cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint')
    print('Using device:', cfg.device)
    if 'cuda' in cfg.device:
        print('Using parallel:', cfg.parallel)
    if cfg.parallel:
        print('Device ids:', cfg.device_ids)

    print('\nLoading data...')

    trainloader = get_dataloader(cfg, 'train')
    if cfg.val.ison or cfg.vis.ison:
        valset = get_dataset(cfg, 'val')
        valloader = get_dataloader(cfg, 'val')
    print('Data loaded.')

    print('Initializing model...')
    model = get_model(cfg)
    model = model.to(cfg.device)
    print('Model initialized.')
    model.train()

    optimizer = get_optimizer(cfg, model)

    # Checkpointer will print information.
    checkpointer = Checkpointer(os.path.join(cfg.checkpointdir, cfg.exp_name),
                                max_num=cfg.train.max_ckpt)

    start_epoch = 0
    start_iter = 0
    global_step = 0
    if cfg.resume:
        checkpoint = checkpointer.load(cfg.resume_ckpt, model, optimizer)
        if checkpoint:
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_step'] + 1
    if cfg.parallel:
        model = nn.DataParallel(model, device_ids=cfg.device_ids)

    writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name),
                           purge_step=global_step,
                           flush_secs=30)
    metric_logger = MetricLogger()
    vis_logger = get_vislogger(cfg)
    evaluator = get_evaluator(cfg)

    print('Start training')
    end_flag = False
    for epoch in range(start_epoch, cfg.train.max_epochs):
        if end_flag: break
        start = time.perf_counter()
        for i, data in enumerate(trainloader):
            end = time.perf_counter()
            data_time = end - start
            start = end

            imgs, *_ = [d.to(cfg.device) for d in data]
            model.train()
            loss, log = model(imgs, global_step)
            # If you are using DataParallel
            loss = loss.mean()
            optimizer.zero_grad()
            loss.backward()
            if cfg.train.clip_norm:
                clip_grad_norm_(model.parameters(), cfg.train.clip_norm)
            optimizer.step()

            end = time.perf_counter()
            batch_time = end - start

            metric_logger.update(data_time=data_time)
            metric_logger.update(batch_time=batch_time)
            metric_logger.update(loss=loss.item())

            if (global_step + 1) % cfg.train.print_every == 0:
                start = time.perf_counter()
                log.update(loss=metric_logger['loss'].median)
                vis_logger.model_log_vis(writer, log, global_step + 1)
                end = time.perf_counter()
                device_text = cfg.device_ids if cfg.parallel else cfg.device
                print(
                    'exp: {}, device: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s'
                    .format(cfg.exp_name, device_text, epoch + 1, i + 1,
                            len(trainloader), global_step + 1,
                            metric_logger['loss'].median,
                            metric_logger['batch_time'].avg,
                            metric_logger['data_time'].avg, end - start))

            if (global_step + 1) % cfg.train.save_every == 0:
                start = time.perf_counter()
                checkpointer.save(model, optimizer, epoch, global_step)
                print('Saving checkpoint takes {:.4f}s.'.format(
                    time.perf_counter() - start))

            if (global_step + 1) % cfg.vis.vis_every == 0 and cfg.vis.ison:
                print('Doing visualization...')
                start = time.perf_counter()
                vis_logger.train_vis(model,
                                     valset,
                                     writer,
                                     global_step,
                                     cfg.vis.indices,
                                     cfg.device,
                                     cond_steps=cfg.vis.cond_steps,
                                     fg_sample=cfg.vis.fg_sample,
                                     bg_sample=cfg.vis.bg_sample,
                                     num_gen=cfg.vis.num_gen)
                print(
                    'Visualization takes {:.4f}s.'.format(time.perf_counter() -
                                                          start))

            if (global_step + 1) % cfg.val.val_every == 0 and cfg.val.ison:
                print('Doing evaluation...')
                start = time.perf_counter()
                evaluator.train_eval(
                    evaluator, os.path.join(cfg.evaldir,
                                            cfg.exp_name), cfg.val.metrics,
                    cfg.val.eval_types, cfg.val.intervals, cfg.val.cond_steps,
                    model, valset, valloader, cfg.device, writer, global_step,
                    [model, optimizer, epoch, global_step], checkpointer)
                print('Evaluation takes {:.4f}s.'.format(time.perf_counter() -
                                                         start))

            start = time.perf_counter()
            global_step += 1
            if global_step >= cfg.train.max_steps:
                end_flag = True
                break
Example #19
def train(args):
    print(args)
    dataset = MovieLens(
        args.data_name,
        args.ctx,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx)
    net.hybridize()
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx,
                                            dtype=np.float32)
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {"learning_rate": args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ["iter", "loss", "rmse"],
        ["%d", "%.4f", "%.4f"],
        os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id),
    )
    valid_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id),
    )
    test_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id),
    )

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        with mx.autograd.record():
            pred_ratings = net(
                dataset.train_enc_graph,
                dataset.train_dec_graph,
                dataset.user_feature,
                dataset.movie_feature,
            )
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            loss.backward()

        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, "net%d.txt" % args.save_id)))

        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape(
                                 (1, -1))).sum(axis=1)
        rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx,
                avg_gnorm / args.train_log_interval,
                count_loss / iter_idx,
                count_rmse / count_num,
                np.average(dur),
            )
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment="valid")
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                net.save_parameters(filename=os.path.join(
                    args.save_dir, "best_valid_net{}.params".format(
                        args.save_id)))
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment="test")
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ", Test RMSE={:.4f}".format(test_rmse)
            else:
                no_better_valid += 1
                if (no_better_valid > args.train_early_stopping_patience
                        and trainer.learning_rate <= args.train_min_lr):
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
Example #20
def train_linear_one_epoch(train_loader, model, criterion, optimizer, config,
                           device, epoch):
    log_header = 'EPOCH {}'.format(epoch + 1)
    losses = AverageMeter('Loss', fmt=':.4f')
    top1 = AverageMeter('Top1', fmt=':4.2f')
    top5 = AverageMeter('Top5', fmt=':4.2f')
    lr = AverageMeter('Lr', fmt=":.4f")

    metric_logger = MetricLogger(delimeter=" | ")
    metric_logger.add_meter(losses)
    metric_logger.add_meter(top1)
    metric_logger.add_meter(top5)
    metric_logger.add_meter(lr)

    for step, (img, target) in enumerate(
            metric_logger.log_every(train_loader, config.system.print_freq,
                                    log_header)):
        img = img.to(device)
        target = target.to(device)
        logit = model(img)

        loss = criterion(logit, target)
        acc1, acc5 = accuracy(logit, target, topk=(1, 5))
        lr_ = optimizer.param_groups[0]['lr']

        metric_logger.update(Loss=loss.detach().cpu().item(),
                             Top1=acc1.detach().cpu().item(),
                             Top5=acc5.detach().cpu().item(),
                             Lr=lr_)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #21
def main():

    resume = True
    path = 'data/NYU_DEPTH'
    batch_size = 16
    epochs = 10000
    device = torch.device('cuda:0')
    print_every = 5
    # exp_name = 'resnet18_nodropout_new'
    exp_name = 'only_depth'
    # exp_name = 'normal_internel'
    # exp_name = 'sep'
    lr = 1e-5
    weight_decay = 0.0005
    log_dir = os.path.join('logs', exp_name)
    model_dir = os.path.join('checkpoints', exp_name)
    val_every = 16
    save_every = 16


    # tensorboard
    # remove old logs if not resuming
    if not resume:
        if os.path.exists(log_dir):
            shutil.rmtree(log_dir)
        os.makedirs(log_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    tb = SummaryWriter(log_dir)
    tb.add_custom_scalars({
        'metrics': {
            'thres_1.25': ['Multiline', ['thres_1.25/train', 'thres_1.25/test']],
            'thres_1.25_2': ['Multiline', ['thres_1.25_2/train', 'thres_1.25_2/test']],
            'thres_1.25_3': ['Multiline', ['thres_1.25_3/train', 'thres_1.25_3/test']],
            'ard': ['Multiline', ['ard/train', 'ard/test']],
            'srd': ['Multiline', ['srd/train', 'srd/test']],
            'rmse_linear': ['Multiline', ['rmse_linear/train', 'rmse_linear/test']],
            'rmse_log': ['Multiline', ['rmse_log/train', 'rmse_log/test']],
            'rmse_log_invariant': ['Multiline', ['rmse_log_invariant/train', 'rmse_log_invariant/test']],
        }
    })
    
    
    # data loader
    dataset = NYUDepth(path, 'train')
    dataloader = DataLoader(dataset, batch_size, shuffle=True, num_workers=4)
    
    dataset_test = NYUDepth(path, 'test')
    dataloader_test = DataLoader(dataset_test, batch_size, shuffle=True, num_workers=4)
    
    
    # load model
    model = FCRN(True)
    model = model.to(device)
    
    
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    start_epoch = 0
    if resume:
        model_path = os.path.join(model_dir, 'model.pth')
        if os.path.exists(model_path):
            print('Loading checkpoint from {}...'.format(model_path))
            # load model and optimizer
            checkpoint = torch.load(os.path.join(model_dir, 'model.pth'), map_location='cpu')
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            print('Model loaded.')
        else:
            print('No checkpoint found. Train from scratch')
    
    # training
    metric_logger = MetricLogger()
    
    end = time.perf_counter()
    max_iters = epochs * len(dataloader)
    
    def normal_loss(pred, normal, conf):
        """
        :param pred: (B, 3, H, W)
        :param normal: (B, 3, H, W)
        :param conf: (B, 1, H, W)
        """
        dot_prod = (pred * normal).sum(dim=1)
        # weighted loss, (B, )
        batch_loss = ((1 - dot_prod) * conf[:, 0]).sum(1).sum(1)
        # normalize, to (B, )
        batch_loss /= conf[:, 0].sum(1).sum(1)
        return batch_loss.mean()

    def consistency_loss(pred, cloud, normal, conf):
        """
        :param pred: (B, 1, H, W)
        :param normal: (B, 3, H, W)
        :param cloud: (B, 3, H, W)
        :param conf: (B, 1, H, W)
        """
        B, _, _, _ = normal.size()
        normal = normal.detach()
        cloud = cloud.clone()
        cloud[:, 2:3, :, :] = pred
        # algorithm: use a kernel
        kernel = torch.ones((1, 1, 7, 7), device=pred.device)
        kernel = -kernel
        kernel[0, 0, 3, 3] = 48
    
        cloud_0 = cloud[:, 0:1]
        cloud_1 = cloud[:, 1:2]
        cloud_2 = cloud[:, 2:3]
        diff_0 = F.conv2d(cloud_0, kernel, padding=6, dilation=2)
        diff_1 = F.conv2d(cloud_1, kernel, padding=6, dilation=2)
        diff_2 = F.conv2d(cloud_2, kernel, padding=6, dilation=2)
        # (B, 3, H, W)
        diff = torch.cat((diff_0, diff_1, diff_2), dim=1)
        # normalize
        diff = F.normalize(diff, dim=1)
        # (B, 1, H, W)
        dot_prod = (diff * normal).sum(dim=1, keepdim=True)
        # weighted mean over image
        dot_prod = torch.abs(dot_prod.view(B, -1))
        conf = conf.view(B, -1)
        loss = (dot_prod * conf).sum(1) / conf.sum(1)
        # mean over batch
        return loss.mean()
    
    def criterion(depth_pred, normal_pred, depth, normal, cloud, conf):
        mse_loss = F.mse_loss(depth_pred, depth)
        consis_loss = consistency_loss(depth_pred, cloud, normal_pred, conf)
        norm_loss = normal_loss(normal_pred, normal, conf)
        return mse_loss, consis_loss, norm_loss
    
    print('Start training')
    for epoch in range(start_epoch, epochs):
        # train
        model.train()
        for i, data in enumerate(dataloader):
            start = end
            i += 1  # switch to a 1-based iteration index for logging
            data = [x.to(device) for x in data]
            image, depth, normal, conf, cloud = data
            depth_pred, normal_pred = model(image)
            mse_loss, consis_loss, norm_loss = criterion(depth_pred, normal_pred, depth, normal, cloud, conf)
            loss = mse_loss + consis_loss + norm_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # bookkeeping
            end = time.perf_counter()
            metric_logger.update(loss=loss.item())
            metric_logger.update(mse_loss=mse_loss.item())
            metric_logger.update(norm_loss=norm_loss.item())
            metric_logger.update(consis_loss=consis_loss.item())
            metric_logger.update(batch_time=end-start)

            
            if i % print_every == 0:
                # Compute eta. global step: starting from 1
                global_step = epoch * len(dataloader) + i
                seconds = (max_iters - global_step) * metric_logger['batch_time'].global_avg
                eta = datetime.timedelta(seconds=int(seconds))
                # to display: eta, epoch, iteration, loss, batch_time
                display_dict = {
                    'eta': eta,
                    'epoch': epoch,
                    'iter': i,
                    'loss': metric_logger['loss'].median,
                    'batch_time': metric_logger['batch_time'].median
                }
                display_str = [
                    'eta: {eta}s',
                    'epoch: {epoch}',
                    'iter: {iter}',
                    'loss: {loss:.4f}',
                    'batch_time: {batch_time:.4f}s',
                ]
                print(', '.join(display_str).format(**display_dict))
                
                # tensorboard
                min_depth = depth[0].min()
                max_depth = depth[0].max() * 1.25
                depth = (depth[0] - min_depth) / (max_depth - min_depth)
                depth_pred = (depth_pred[0] - min_depth) / (max_depth - min_depth)
                depth_pred = torch.clamp(depth_pred, min=0.0, max=1.0)
                normal = (normal[0] + 1) / 2
                normal_pred = (normal_pred[0] + 1) / 2
                conf = conf[0]
                
                tb.add_scalar('train/loss', metric_logger['loss'].median, global_step)
                tb.add_scalar('train/mse_loss', metric_logger['mse_loss'].median, global_step)
                tb.add_scalar('train/consis_loss', metric_logger['consis_loss'].median, global_step)
                tb.add_scalar('train/norm_loss', metric_logger['norm_loss'].median, global_step)
                
                tb.add_image('train/depth', depth, global_step)
                tb.add_image('train/normal', normal, global_step)
                tb.add_image('train/depth_pred', depth_pred, global_step)
                tb.add_image('train/normal_pred', normal_pred, global_step)
                tb.add_image('train/conf', conf, global_step)
                tb.add_image('train/image', image[0], global_step)
                
        if (epoch) % val_every == 0 and epoch != 0:
            # validate after each epoch
            validate(dataloader, model, device, tb, epoch, 'train')
            validate(dataloader_test, model, device, tb, epoch, 'test')
        if (epoch) % save_every == 0 and epoch != 0:
            to_save = {
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'epoch': epoch,
            }
            torch.save(to_save, os.path.join(model_dir, 'model.pth'))
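
A note on the consistency kernel in the example above: the 7x7 kernel of -1s with 48 at the center sums to zero, so it responds only to local depth variation. A quick standalone check (hypothetical sizes):

import torch
import torch.nn.functional as F

# Same kernel as above: -1 everywhere, 48 at the center, so the weights sum to zero.
kernel = -torch.ones((1, 1, 7, 7))
kernel[0, 0, 3, 3] = 48

flat = torch.full((1, 1, 32, 32), 5.0)      # a constant depth channel
resp = F.conv2d(flat, kernel, dilation=2)   # no padding: interior responses only
print(resp.abs().max())                     # tensor(0.): flat regions give no signal
# With padding=6 and dilation=2, as in the example, the output keeps the input size.
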
Example #22
0
def train(cfg):

    print('Experiment name:', cfg.exp_name)
    print('Dataset:', cfg.dataset)
    print('Model name:', cfg.model)
    print('Resume:', cfg.resume)
    if cfg.resume:
        print('Checkpoint:',
              cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint')
    print('Using device:', cfg.device)
    if 'cuda' in cfg.device:
        print('Using parallel:', cfg.parallel)
    if cfg.parallel:
        print('Device ids:', cfg.device_ids)

    print('Loading data')

    if cfg.exp_name == 'table':
        data_set = np.load('{}/train/all_set_train.npy'.format(
            cfg.dataset_roots.TABLE))
        data_size = len(data_set)
    else:
        trainloader = get_dataloader(cfg, 'train')
        data_size = len(trainloader)
    if cfg.train.eval_on:
        valset = get_dataset(cfg, 'val')
        # valloader = get_dataloader(cfg, 'val')
        evaluator = get_evaluator(cfg)
    model = get_model(cfg)
    model = model.to(cfg.device)
    checkpointer = Checkpointer(osp.join(cfg.checkpointdir, cfg.exp_name),
                                max_num=cfg.train.max_ckpt)
    model.train()

    optimizer_fg, optimizer_bg = get_optimizers(cfg, model)

    start_epoch = 0
    start_iter = 0
    global_step = 0
    if cfg.resume:
        checkpoint = checkpointer.load_last(cfg.resume_ckpt, model,
                                            optimizer_fg, optimizer_bg)
        if checkpoint:
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_step'] + 1
    if cfg.parallel:
        model = nn.DataParallel(model, device_ids=cfg.device_ids)

    writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name),
                           flush_secs=30,
                           purge_step=global_step)
    vis_logger = get_vislogger(cfg)
    metric_logger = MetricLogger()

    print('Start training')
    end_flag = False
    for epoch in range(start_epoch, cfg.train.max_epochs):
        if end_flag:
            break
        if cfg.exp_name == 'table':
            # create indices and shuffle them so the data is visited in random order
            idx_set = np.arange(data_size)
            np.random.shuffle(idx_set)
            # np.split needs an exact multiple of the batch size; array_split also keeps the remainder
            idx_set = np.array_split(
                idx_set, range(cfg.train.batch_size, data_size, cfg.train.batch_size))
            data_to_enumerate = idx_set
        else:
            trainloader = get_dataloader(cfg, 'train')
            data_to_enumerate = trainloader
            data_size = len(trainloader)

        start = time.perf_counter()
        for i, enumerated_data in enumerate(data_to_enumerate):

            end = time.perf_counter()
            data_time = end - start
            start = end

            model.train()
            if cfg.exp_name == 'table':
                data_i = data_set[enumerated_data]
                data_i = torch.from_numpy(data_i).float().to(cfg.device)
                data_i /= 255
                data_i = data_i.permute([0, 3, 1, 2])
                imgs = data_i
            else:

                imgs = enumerated_data
                imgs = imgs.to(cfg.device)

            loss, log = model(imgs, global_step)
            # In case of using DataParallel
            loss = loss.mean()
            optimizer_fg.zero_grad()
            optimizer_bg.zero_grad()
            loss.backward()
            if cfg.train.clip_norm:
                clip_grad_norm_(model.parameters(), cfg.train.clip_norm)

            optimizer_fg.step()

            # if cfg.train.stop_bg == -1 or global_step < cfg.train.stop_bg:
            optimizer_bg.step()

            end = time.perf_counter()
            batch_time = end - start

            metric_logger.update(data_time=data_time)
            metric_logger.update(batch_time=batch_time)
            metric_logger.update(loss=loss.item())

            if (global_step) % cfg.train.print_every == 0:
                start = time.perf_counter()
                log.update({
                    'loss': metric_logger['loss'].median,
                })
                vis_logger.train_vis(writer, log, global_step, 'train')
                end = time.perf_counter()

                print(
                    'exp: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s'
                    .format(cfg.exp_name, epoch + 1, i + 1, data_size,
                            global_step, metric_logger['loss'].median,
                            metric_logger['batch_time'].avg,
                            metric_logger['data_time'].avg, end - start))
            if (global_step) % cfg.train.create_image_every == 0:
                vis_logger.test_create_image(
                    log,
                    '../output/{}_img_{}.png'.format(cfg.dataset, global_step))
            if (global_step) % cfg.train.save_every == 0:
                start = time.perf_counter()
                checkpointer.save_last(model, optimizer_fg, optimizer_bg,
                                       epoch, global_step)
                print('Saving checkpoint takes {:.4f}s.'.format(
                    time.perf_counter() - start))

            if (global_step) % cfg.train.eval_every == 0 and cfg.train.eval_on:
                pass
                '''print('Validating...')
                start = time.perf_counter()
                checkpoint = [model, optimizer_fg, optimizer_bg, epoch, global_step]
                if cfg.exp_name == 'table':
                    evaluator.train_eval(model, None, None, writer, global_step, cfg.device, checkpoint, checkpointer)
                else:
                    evaluator.train_eval(model, valset, valset.bb_path, writer, global_step, cfg.device, checkpoint, checkpointer)

                print('Validation takes {:.4f}s.'.format(time.perf_counter() - start))'''

            start = time.perf_counter()
            global_step += 1
            if global_step > cfg.train.max_steps:
                end_flag = True
                break
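
The 'table' branch above batches by shuffling a flat index array and splitting it; np.array_split (used in the fix above) also handles sizes that are not an exact multiple of the batch size, where np.split would raise. In miniature:

import numpy as np

data_size, batch_size = 10, 4
idx_set = np.arange(data_size)
np.random.shuffle(idx_set)

# Split at positions 4 and 8; the remainder becomes a short final batch.
batches = np.array_split(idx_set, range(batch_size, data_size, batch_size))
print([b.tolist() for b in batches])   # e.g. [[3, 7, 0, 9], [4, 1, 8, 6], [2, 5]]
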
Example #23
0
def train(args):
    dataset = MovieLens(args.data_name,
                        args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm)
    print("Loading data finished ...\n")

    args.src_key = dataset.name_user
    args.dst_key = dataset.name_movie
    args.src_in_units = dataset.user_feature.shape[1]
    args.dst_in_units = dataset.movie_feature.shape[1]
    args.nratings = dataset.possible_rating_values.size

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    if args.gen_r_use_classification:
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx,
                                                dtype=np.float32)
        rating_loss_net = gluon.loss.SoftmaxCELoss()
    else:
        rating_mean = dataset.train_rating_values.mean()
        rating_std = dataset.train_rating_values.std()
        rating_loss_net = gluon.loss.L2Loss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_rating_pairs = mx.nd.array(dataset.train_rating_pairs,
                                     ctx=args.ctx,
                                     dtype=np.int64)
    train_gt_ratings = mx.nd.array(dataset.train_rating_values,
                                   ctx=args.ctx,
                                   dtype=np.float32)

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized so the final print cannot hit a NameError
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    for iter_idx in range(1, args.train_max_iter):
        if args.gen_r_use_classification:
            train_gt_label = mx.nd.array(np.searchsorted(
                dataset.possible_rating_values, dataset.train_rating_values),
                                         ctx=args.ctx,
                                         dtype=np.int32)
        with mx.autograd.record():
            pred_ratings = net(dataset.train_graph, train_rating_pairs)
            if args.gen_r_use_classification:
                loss = rating_loss_net(pred_ratings, train_gt_label).mean()
            else:
                loss = rating_loss_net(
                    mx.nd.reshape(pred_ratings, shape=(-1, )),
                    (train_gt_ratings - rating_mean) / rating_std).mean()
            #loss.wait_to_read()
            loss.backward()

        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)  #, ignore_stale_grad=True)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(
                gluon_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        if args.gen_r_use_classification:
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape(
                                     (1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        else:
            rmse = mx.nd.square(
                pred_ratings.reshape((-1, )) * rating_std + rating_mean -
                train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                iter_idx, avg_gnorm / args.train_log_interval,
                count_loss / iter_idx, count_rmse / count_num)
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        trainer.learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
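
The MetricLogger used in this and the following examples is a CSV writer built from column names, printf-style formats, and an output path. A minimal sketch of that assumed interface (the real implementation may differ):

import os
from collections import OrderedDict

class MetricLogger:
    """CSV-style logger guessed from the call sites: column names, formats, path."""
    def __init__(self, attr_names, parse_formats, save_path):
        # zip silently truncates, so the two lists must be the same length
        self._attr_format_dict = OrderedDict(zip(attr_names, parse_formats))
        self._file = open(save_path, 'w')
        self._file.write(','.join(attr_names) + '\n')
        self._file.flush()

    def log(self, **kwargs):
        self._file.write(','.join(fmt % kwargs[name] for name, fmt
                                  in self._attr_format_dict.items()) + '\n')
        self._file.flush()

    def close(self):
        self._file.close()

The zip here is also why the mismatched name/format lists corrected in Example #25 matter: extra formats would be silently dropped.
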
Example #24
0
def train(args):
    print(args)
    dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized so the final print cannot hit a NameError
    no_better_valid = 0
    best_iter = -1

    enc_graph = dataset.train_enc_graph
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
    g_user_fea = mx.nd.zeros((dataset.num_user,))
    g_movie_fea = mx.nd.zeros((dataset.num_movie,))
    train_truths = dataset.train_truths
    train_labels = dataset.train_labels

    print("Start training ...")
    dur = []

    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()

        num_edges = dataset.train_truths.shape[0]
        seed = mx.nd.arange(num_edges, dtype='int64')
        edges = mx.nd.shuffle(seed)
        # each iteration will go through all edges
        num_batches = (num_edges + args.minibatch_size - 1) // args.minibatch_size
        for sample_idx in range(num_batches):
            lo = sample_idx * args.minibatch_size
            hi = min((sample_idx + 1) * args.minibatch_size, num_edges)
            edge_ids = edges[lo:hi]
            head_ids, tail_ids = dataset.train_dec_graph.find_edges(edge_ids.asnumpy())

            head_subgraphs = {}
            tail_subgraphs = {}
            head_node_ids = np.unique(head_ids.asnumpy())
            tail_node_ids = np.unique(tail_ids.asnumpy())
            for i, _ in enumerate(args.rating_vals):
                t = enc_graph.canonical_etypes[i * 2]
                rev_t = enc_graph.canonical_etypes[i * 2 + 1]

                head_in_edges = enc_graph.in_edges(head_node_ids, 'eid', etype=rev_t)
                tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid', etype=t)

                if head_in_edges.shape[0] > 0:
                    head_subgraphs[rev_t] = head_in_edges

                if tail_in_edges.shape[0] > 0:
                    tail_subgraphs[t] = tail_in_edges

            head_subgraph = enc_graph.edge_subgraph(head_subgraphs, preserve_nodes=True)
            tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs, preserve_nodes=True)
            edge_ids = edge_ids.as_in_context(args.ctx)
            true_relation_ratings = train_truths[edge_ids]
            true_relation_labels = train_labels[edge_ids]

            head_NID = head_subgraph.nodes['user'].data[dgl.NID]
            tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID]

            g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32')
            g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32')

            true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx)
            true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx)

            with mx.autograd.record():
                pred_ratings = net(head_subgraph, tail_subgraph,
                                   true_head_idx, true_tail_idx)
                loss = rating_loss_net(pred_ratings, true_relation_labels).mean()
                loss.backward()
            gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
            trainer.step(1.0, ignore_stale_grad=True)
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - true_relation_ratings).mean().asscalar()
            rmse = np.sqrt(rmse)
            loss = loss.asscalar()
            if sample_idx % 100 == 0:
                train_loss_logger.log(iter=iter_idx, idx=sample_idx,
                                  loss=loss, rmse=rmse)
                print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(iter_idx,
                    sample_idx, gnorm, loss, rmse))
            gc.collect()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
           logging_str = "Iter={}, time={:.4f}".format(
                iter_idx, np.average(dur))

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and trainer.learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
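
The g_user_fea/g_movie_fea scatter near the end of Example #24 is a global-id to subgraph-local lookup: the positions of the subgraph's nodes are written into a dense array indexed by global node id, so edge endpoints given as global ids can be translated into local indices. The same trick in miniature (hypothetical numbers, plain NumPy):

import numpy as np

num_nodes = 10
lookup = np.zeros(num_nodes, dtype=np.int64)      # global id -> local position

subgraph_nids = np.array([2, 5, 7])               # global ids kept in the subgraph
lookup[subgraph_nids] = np.arange(len(subgraph_nids))

edge_heads = np.array([5, 2, 7, 5])               # edge endpoints as global ids
print(lookup[edge_heads])                         # [1 0 2 1]: subgraph-local indices
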
Example #25
0
def train(args):
    print(args)
    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.MSELoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'ndcg', 'precision', 'recall', 'fscore', 'support'],
                                     ['%d', '%.4f', '%s', '%s', '%s', '%s'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter'], ['%d'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized so the final print cannot hit a NameError
    no_better_valid = 0
    best_iter = -1
    count_rmse = 1
    count_num = 1
    count_loss = 0
    count_step = 0

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    def batch(iterable, n=1):
        current_batch = []
        for item in iterable:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
        if current_batch:
            yield current_batch
    batches = []
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        unique_item_list = dataset.train['item_id'].unique().tolist()

        ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                   dataset.user_feature, dataset.movie_feature)
        from tqdm import tqdm  # local import, kept next to its only use
        if iter_idx == 1:
            for row in tqdm(list(dataset.train.itertuples())):
                user, item, rating = row.user_id, row.item_id, row.rating
                userid = dataset.global_user_id_map[user]
                observed = dataset.train[dataset.train['user_id'] == user]['item_id'].unique().tolist()
                negatives = set()
                while len(negatives) < 1:
                    sample = random.choice(unique_item_list)
                    if sample not in observed:
                        negatives.add(sample)
                        batches.append((userid, dataset.global_item_id_map[item], dataset.global_item_id_map[sample]))

        for bt in tqdm(list(batch(batches, 2**14))):
            uidfeat = ufeat[[e[0] for e in bt]]
            posfeat = ifeat[[e[1] for e in bt]]
            negfeat = ifeat[[e[2] for e in bt]]

            pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
            neg_scores = uidfeat @ net.decoder.Q @ negfeat.T

            lmbd = 1e-5
            # BCE term pushes positive scores towards 1; the log-sigmoid term is the BPR margin.
            bce_term = nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores))
            bpr_term = nn.LogSigmoid()(pos_scores - neg_scores).mean()
            mf_loss = bce_term - bpr_term

            regularizer = ((th.norm(uidfeat, dim=1) ** 2).mean()
                           + (th.norm(posfeat, dim=1) ** 2).mean()
                           + (th.norm(negfeat, dim=1) ** 2).mean()
                           + th.norm(net.decoder.Q))
            emb_loss = lmbd * regularizer
            print('mf_loss', mf_loss)
            print('emb_loss', emb_loss)
            optimizer.zero_grad()
            loss = mf_loss + emb_loss
            count_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()
            ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                       dataset.user_feature, dataset.movie_feature)
            count_step += 1

        print('train done')

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (count_step + 1))
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss/(count_step + 1), count_rmse/count_num,
                np.average(dur))
            count_rmse = 1
            count_num = 1

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid')
            ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid')
            print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support)
            valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore,
                                  support=support)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
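
One subtlety in the ranking loss above: uidfeat @ net.decoder.Q @ posfeat.T produces a full batch-by-batch score matrix, whereas BPR-style objectives want one score per (user, positive, negative) triple. A minimal elementwise sketch of that per-triple form (hypothetical shapes, not the example's decoder):

import torch
import torch.nn.functional as F

B, D = 8, 16
ufeat, posfeat, negfeat = torch.randn(B, D), torch.randn(B, D), torch.randn(B, D)
Q = torch.randn(D, D)

# One bilinear score per triple: the diagonal of (U Q P^T) without the BxB matrix.
pos_scores = ((ufeat @ Q) * posfeat).sum(dim=1)   # (B,)
neg_scores = ((ufeat @ Q) * negfeat).sum(dim=1)   # (B,)

# BPR: maximize the log-sigmoid of the pairwise margin.
loss = -F.logsigmoid(pos_scores - neg_scores).mean()
print(loss)
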
Example #26
0
def train(cfg):
    
    print('Experiment name:', cfg.exp_name)
    print('Dataset:', cfg.dataset)
    print('Model name:', cfg.model)
    print('Resume:', cfg.resume)
    if cfg.resume:
        print('Checkpoint:', cfg.resume_ckpt if cfg.resume_ckpt else 'last checkpoint')
    print('Using device:', cfg.device)
    if 'cuda' in cfg.device:
        print('Using parallel:', cfg.parallel)
    if cfg.parallel:
        print('Device ids:', cfg.device_ids)
    
    print('Loading data')

    trainloader = get_dataloader(cfg, 'train')
    if cfg.train.eval_on:
        valset = get_dataset(cfg, 'val')
        # valloader = get_dataloader(cfg, 'val')
        evaluator = get_evaluator(cfg)
    model = get_model(cfg)
    model = model.to(cfg.device)
    checkpointer = Checkpointer(osp.join(cfg.checkpointdir, cfg.exp_name), max_num=cfg.train.max_ckpt)
    model.train()

    optimizer_fg, optimizer_bg = get_optimizers(cfg, model)

    start_epoch = 0
    start_iter = 0
    global_step = 0
    if cfg.resume:
        checkpoint = checkpointer.load_last(cfg.resume_ckpt, model, optimizer_fg, optimizer_bg)
        if checkpoint:
            start_epoch = checkpoint['epoch']
            global_step = checkpoint['global_step'] + 1
    if cfg.parallel:
        model = nn.DataParallel(model, device_ids=cfg.device_ids)
    
    writer = SummaryWriter(log_dir=os.path.join(cfg.logdir, cfg.exp_name), flush_secs=30, purge_step=global_step)
    vis_logger = get_vislogger(cfg)
    metric_logger = MetricLogger()

    print('Start training')
    end_flag = False
    for epoch in range(start_epoch, cfg.train.max_epochs):
        if end_flag:
            break
    
        start = time.perf_counter()
        for i, data in enumerate(trainloader):
        
            end = time.perf_counter()
            data_time = end - start
            start = end
        
            model.train()
            imgs = data
            imgs = imgs.to(cfg.device)
            loss, log = model(imgs, global_step)
            # In case of using DataParallel
            loss = loss.mean()
            optimizer_fg.zero_grad()
            optimizer_bg.zero_grad()
            loss.backward()
            if cfg.train.clip_norm:
                clip_grad_norm_(model.parameters(), cfg.train.clip_norm)
        
            optimizer_fg.step()
        
            # if cfg.train.stop_bg == -1 or global_step < cfg.train.stop_bg:
            optimizer_bg.step()
        
            end = time.perf_counter()
            batch_time = end - start
        
            metric_logger.update(data_time=data_time)
            metric_logger.update(batch_time=batch_time)
            metric_logger.update(loss=loss.item())
        
            if (global_step) % cfg.train.print_every == 0:
                start = time.perf_counter()
                log.update({
                    'loss': metric_logger['loss'].median,
                })
                vis_logger.train_vis(writer, log, global_step, 'train')
                end = time.perf_counter()
            
                print(
                    'exp: {}, epoch: {}, iter: {}/{}, global_step: {}, loss: {:.2f}, batch time: {:.4f}s, data time: {:.4f}s, log time: {:.4f}s'.format(
                        cfg.exp_name, epoch + 1, i + 1, len(trainloader), global_step, metric_logger['loss'].median,
                        metric_logger['batch_time'].avg, metric_logger['data_time'].avg, end - start))
                
            if (global_step) % cfg.train.save_every == 0:
                start = time.perf_counter()
                checkpointer.save_last(model, optimizer_fg, optimizer_bg, epoch, global_step)
                print('Saving checkpoint takes {:.4f}s.'.format(time.perf_counter() - start))
        
            if (global_step) % cfg.train.eval_every == 0 and cfg.train.eval_on:
                print('Validating...')
                start = time.perf_counter()
                checkpoint = [model, optimizer_fg, optimizer_bg, epoch, global_step]
                evaluator.train_eval(model, valset, valset.bb_path, writer, global_step, cfg.device, checkpoint, checkpointer)
                print('Validation takes {:.4f}s.'.format(time.perf_counter() - start))
        
            start = time.perf_counter()
            global_step += 1
            if global_step > cfg.train.max_steps:
                end_flag = True
                break
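
Examples #21, #22, and #26 all resume by restoring model and optimizer state together with the epoch (and global step) they stopped at. A minimal torch version of that save/load round-trip (hypothetical helper names and path):

import os
import torch

def save_last(path, model, optimizer, epoch, global_step):
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'global_step': global_step}, path)

def load_last(path, model, optimizer):
    if not os.path.exists(path):
        return None                       # train from scratch
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt                           # caller reads 'epoch' / 'global_step'
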
Example #27
0
def training(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments):
    logger = logging.getLogger("RetinaNet.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
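
reduce_loss_dict above averages each loss across processes so the logged numbers reflect all GPUs, not just the local rank. A sketch of one common implementation (maskrcnn-benchmark-style; assumed, not taken from this file):

import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    """Average a dict of scalar loss tensors across processes, for logging only."""
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())        # identical order on every rank
        stacked = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(stacked, dst=0)             # sum onto rank 0
        if dist.get_rank() == 0:
            stacked /= world_size               # turn the sum into a mean
        return dict(zip(names, stacked))
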
Example #28
0
def train(inputs, outputs, args, logger):
    """
     :param:
     - inputs: (list) 作为输入的tensor, 它是由preprocess.py处理得的
     - outputs: (tensor) 作为标注的tensor, 它是由preprocess.py处理得的
     - args: 一堆训练前规定好的参数
     - logger: 训练日志,可以把训练过程记录在./ckpt/log.txt
     :return: 训练结束
     """
    # 创建数据集
    # inputs[0] (50000,1024)即(data_num,max_input_len)
    # outputs (50000) 即(data_num)
    torch_dataset = Data.TensorDataset(inputs[0], inputs[1], inputs[2], outputs)
    loader = Data.DataLoader(dataset=torch_dataset, batch_size=args.batch_size, shuffle=True)
    logger.info('[1] Building model')
    # pick the device this training script runs on; prefer cuda when available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # build the model
    model = AlbertClassifierModel(num_topics=args.num_topics,
                                  out_channels=args.out_channels,
                                  max_input_len=args.max_input_len,
                                  kernel_size=args.kernel_size,
                                  dropout=args.dropout).to(device)
    model_kwargs = {k: getattr(args, k) for k in
                    {'num_topics', 'out_channels', 'max_input_len', 'kernel_size', 'dropout'}
                    }
    logger.info(model)
    # optimizer
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'adamw':
        optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    meters = MetricLogger(delimiter="  ")
    # CrossEntropyLoss applies log-softmax internally, so the logits need no extra softmax
    criterion = nn.CrossEntropyLoss()
    # scheduler: multiply the learning rate by 0.1 at schedule_step; currently only this first decay step is used
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [args.schedule_step], gamma=0.1)
    logger.info('[2] Start training......')
    for epoch_num in range(args.max_epoch):
        # example_num: how many batches one epoch contains
        example_num = outputs.shape[0] // args.batch_size
        for batch_iter, (input_ids, segments_tensor, attention_mask, label) in enumerate(loader):
            progress = epoch_num + batch_iter / example_num
            optimizer.zero_grad()
            batch_size = args.batch_size
            # forward pass
            pred = model(input_ids.to(device).view(batch_size, -1),
                         attention_mask.to(device).view(batch_size, -1))
            # handle the label
            if label.shape[0] != args.batch_size:
                logger.info('last dummy batch')
                break
            label = label.view(args.batch_size)
            label = label.to(device)
            loss = criterion(pred, label)

            # backward pass
            loss.backward()
            optimizer.step()
            meters.update(loss=loss)
            # log the loss every 0.01 epoch; max(1, ...) guards against tiny datasets
            if (batch_iter + 1) % max(1, example_num // 100) == 0:
                logger.info(
                    meters.delimiter.join(
                        [
                            "progress: {prog:.2f}",
                            "{meters}",
                        ]
                    ).format(
                        prog=progress,
                        meters=str(meters),
                    )
                )
                # in debug mode, break early and go straight to validation
                if args.debug:
                    break
        # validate the effect of this epoch
        precision, score = validate(model, device, args)
        logger.info("val")
        logger.info("precision")
        logger.info(precision)
        logger.info("official score")
        logger.info(score)
        save = {
            'kwargs': model_kwargs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }

        scheduler.step()
        # keep one checkpoint per epoch
        torch.save(save,
                   os.path.join(args.save_dir, 'model_epoch%d_val%.3f.pt' % (epoch_num, score)))
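
The MultiStepLR scheduler above multiplies the learning rate by gamma each time the step counter passes a milestone; with a single milestone this is one hard LR drop. A tiny standalone demonstration (hypothetical numbers):

import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1e-3)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2], gamma=0.1)

for epoch in range(4):
    print(epoch, optimizer.param_groups[0]['lr'])   # 0.001, 0.001, 0.0001, 0.0001
    optimizer.step()        # step the optimizer before the scheduler
    scheduler.step()
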
Example #29
0
def train(args):
    print(args)
    # dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
    #                     test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    dataset = DataSetLoader(args.data_name, args.device,
                use_one_hot_fea=args.use_one_hot_fea,
                symm=args.gcn_agg_norm_symm,
                test_ratio=args.data_test_ratio,
                valid_ratio=args.data_valid_ratio)

    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized so the final print cannot hit a NameError
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
        rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx, rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss/iter_idx, count_rmse/count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
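
Both GCMC variants above (Examples #23 and #29) convert the classifier's output into a real-valued rating by taking the expectation over the possible rating values, then measure RMSE against the ground truth. In miniature:

import torch as th

possible_rating_values = th.tensor([1., 2., 3., 4., 5.])
pred_logits = th.randn(4, 5)               # 4 edges, 5 rating classes

probs = th.softmax(pred_logits, dim=1)     # class probabilities per edge
real_pred_ratings = (probs * possible_rating_values.view(1, -1)).sum(dim=1)

truths = th.tensor([5., 3., 1., 4.])
rmse = ((real_pred_ratings - truths) ** 2).mean().sqrt()
print(real_pred_ratings, rmse)
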
Example #30
0
def train_eval(args):
    logging_config(folder=args.save_dir,
                   name='log{:d}'.format(args.save_id),
                   no_console=False)
    logging.info(args)

    ### check context
    use_cuda = args.gpu >= 0 and th.cuda.is_available()
    if use_cuda:
        th.cuda.set_device(args.gpu)

    ### load data
    dataset = DataLoader(data_name=args.data_name, seed=args.seed)
    print(dataset)
    model = Model(use_KG=True,
                  input_node_dim=args.entity_embed_dim,
                  gnn_model=args.gnn_model,
                  num_gnn_layers=args.gnn_num_layer,
                  n_hidden=args.gnn_hidden_size,
                  dropout=args.dropout_rate,
                  n_entities=dataset.n_KG_entity,
                  n_relations=dataset.n_KG_relation,
                  relation_dim=args.relation_embed_dim,
                  reg_lambda_kg=args.regs,
                  reg_lambda_gnn=args.regs)
    if use_cuda:
        model.cuda()
    logging.info(model)
    ### optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    valid_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg', 'is_best'], ['%d', '%.5f', '%.5f', '%d'],
        os.path.join(args.save_dir, 'valid{:d}.csv'.format(args.save_id)))
    test_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg'], ['%d', '%.5f', '%.5f'],
        os.path.join(args.save_dir, 'test{:d}.csv'.format(args.save_id)))
    best_epoch = -1
    best_recall = 0.0
    test_recall, test_ndcg = 0.0, 0.0  # initialized so the final log cannot hit a NameError

    train_g = dataset.train_g
    nid_th = th.LongTensor(train_g.ndata["id"])
    etype_th = th.LongTensor(train_g.edata["type"])
    if use_cuda:
        nid_th, etype_th = nid_th.cuda(), etype_th.cuda()
    train_g.ndata['id'] = nid_th
    train_g.edata['type'] = etype_th

    test_g = dataset.test_g
    nid_th = th.LongTensor(test_g.ndata["id"])
    etype_th = th.LongTensor(test_g.edata["type"])
    if use_cuda:
        nid_th, etype_th = nid_th.cuda(), etype_th.cuda()
    test_g.ndata['id'] = nid_th
    test_g.edata['type'] = etype_th

    item_id_range = th.LongTensor(dataset.item_id_range).cuda() if use_cuda \
        else th.LongTensor(dataset.item_id_range)

    for epoch in range(1, args.max_epoch + 1):
        ### train kg
        time1 = time()
        kg_sampler = dataset.KG_sampler(batch_size=args.batch_size_kg)
        iter = 0
        total_loss = 0.0
        for h, r, pos_t, neg_t, _ in kg_sampler:
            iter += 1
            model.train()
            h_th = th.LongTensor(h)
            r_th = th.LongTensor(r)
            pos_t_th = th.LongTensor(pos_t)
            neg_t_th = th.LongTensor(neg_t)
            if use_cuda:
                h_th, r_th, pos_t_th, neg_t_th = h_th.cuda(), r_th.cuda(
                ), pos_t_th.cuda(), neg_t_th.cuda()
            loss = model.transR(h_th, r_th, pos_t_th, neg_t_th)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (iter % args.print_every) == 0 or iter == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, iter, total_loss / iter))
        logging.info('Time for KGE: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / iter))

        ### train GNN
        if args.use_attention:
            time1 = time()
            print("Compute attention weight in train ...")
            with th.no_grad():
                A_w = model.compute_attention(train_g)
            train_g.edata['w'] = A_w
            print("Time: {:.2f}s".format(time() - time1))
        time1 = time()
        cf_sampler = dataset.CF_pair_sampler(batch_size=args.batch_size)
        iter = 0
        total_loss = 0.0
        for user_ids, item_pos_ids, item_neg_ids, _ in cf_sampler:
            iter += 1
            model.train()
            user_ids_th = th.LongTensor(user_ids)
            item_pos_ids_th = th.LongTensor(item_pos_ids)
            item_neg_ids_th = th.LongTensor(item_neg_ids)
            if use_cuda:
                user_ids_th, item_pos_ids_th, item_neg_ids_th = \
                    user_ids_th.cuda(), item_pos_ids_th.cuda(), item_neg_ids_th.cuda()
            embedding = model.gnn(train_g, train_g.ndata['id'])
            loss = model.get_loss(embedding, user_ids_th, item_pos_ids_th,
                                  item_neg_ids_th)
            loss.backward()
            # th.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)  # clip gradients
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (iter % args.print_every) == 0 or iter == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, iter, total_loss / iter))
        logging.info('Time for GNN: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / iter))

        if epoch % args.evaluate_every == 0:
            time1 = time()
            val_recall, val_ndcg = eval(model, train_g,
                                        dataset.train_user_dict,
                                        dataset.valid_user_dict, item_id_range,
                                        use_cuda, args.use_attention)

            info = "Epoch{}, [{:.1f}s] val recall:{:.5f}, val ndcg:{:.5f}".format(
                epoch,
                time() - time1, val_recall, val_ndcg)
            # save best model
            if val_recall > best_recall:
                valid_metric_logger.log(epoch=epoch,
                                        recall=val_recall,
                                        ndcg=val_ndcg,
                                        is_best=1)
                best_recall = val_recall
                #best_ndcg = val_ndcg
                best_epoch = epoch
                time1 = time()
                test_recall, test_ndcg = eval(model, test_g,
                                              dataset.train_valid_user_dict,
                                              dataset.test_user_dict,
                                              item_id_range, use_cuda,
                                              args.use_attention)
                test_metric_logger.log(epoch=epoch,
                                       recall=test_recall,
                                       ndcg=test_ndcg)

                info += "\t[{:.1f}s] test recall:{:.5f}, test ndcg:{:.5f}".format(
                    time() - time1, test_recall, test_ndcg)
                #th.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file)
            else:
                valid_metric_logger.log(epoch=epoch,
                                        recall=val_recall,
                                        ndcg=val_ndcg,
                                        is_best=0)
                recall, ndcg = eval(model, test_g,
                                    dataset.train_valid_user_dict,
                                    dataset.test_user_dict, item_id_range,
                                    use_cuda, args.use_attention)
                print("test recall:{}, test_ndcg: {}".format(recall, ndcg))
            logging.info(info)

    logging.info(
        "Final test recall:{:.5f}, test ndcg:{:.5f}, best epoch:{}".format(
            test_recall, test_ndcg, best_epoch))
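
The validation branches in Examples #23-#25 and #29 share a two-level patience scheme: decay the LR after a short stall, and stop entirely once the stall outlasts the early-stopping patience with the LR already at its floor. Condensed into a standalone helper (hypothetical thresholds):

def patience_step(no_better, lr, decay_patience=3, stop_patience=10,
                  decay_factor=0.5, min_lr=1e-5):
    """Return (new_lr, reset_patience, stop) from the two patience counters."""
    if no_better > stop_patience and lr <= min_lr:
        return lr, False, True                 # early stop
    if no_better > decay_patience:
        new_lr = max(lr * decay_factor, min_lr)
        if new_lr < lr:
            return new_lr, True, False         # decay the LR and reset patience
    return lr, False, False                    # keep waiting
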