Example #1
def main():
    log_hardware()
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    log_args(args)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Saving results to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # np.random.choice defaults to replace=True, and pytorch's random_() also samples with replacement
    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()
    LOGGER.log(key=tags.RUN_START)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(
                                   args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(
                                  args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(
                               args.local_rank)))

    valid_negative = test_negs.shape[1]
    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=valid_negative)

    # user/item IDs are assumed zero-based, so the column-wise max + 1 gives the counts
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)
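    # Each test user is ranked against its held-out positive plus `valid_negative`
    # sampled negatives; dup_mask and real_indices are presumably used downstream
    # to discount duplicate negatives when computing HR@K / NDCG@K.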

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
    LOGGER.log(key=tags.INPUT_ORDER)  # data is shuffled later with randperm

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.opt_level == "O2":
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
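    # torch.jit.trace records the loss forward pass on example tensors of the
    # per-GPU batch shape; later calls replay the recorded graph instead of
    # re-dispatching through Python on every micro-batch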
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(admm_utils.count_parameters(model)))
    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
    LOGGER.log(key=tags.OPT_NAME, value="Adam")
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        LOGGER.log(key=tags.EVAL_START, value=0)
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=hr, ndcg=ndcg))
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": 0, "value": hr})
        LOGGER.log(key=tags.EVAL_STOP, value=0)
        LOGGER.log(key='best_eval_throughput', value=eval_throughput)
        return

    success = False
    max_hr = 0
    best_model_timestamp = time.time()  # fallback in case hr never improves
    train_throughputs, eval_throughputs = [], []

    LOGGER.log(key=tags.TRAIN_LOOP)
    for epoch in range(args.epochs):

        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
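        # accumulate gradients over args.grads_accumulated consecutive micro-batches,
        # then take a single optimizer step; grads are cleared by assigning None,
        # which skips the fill kernel that zero_() would launch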
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.opt_level == "O2":
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
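        # every positive interaction is expanded with args.negative_samples sampled
        # negatives, so this is the number of examples the model saw this epoch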
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)
        LOGGER.log(key='train_throughput', value=train_throughput)
        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        LOGGER.log(key=tags.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hr,
                ndcg=ndcg,
                train_time=train_time,
                val_time=val_time))

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)
        LOGGER.log(key='eval_throughput', value=eval_throughput)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                'model.pth')
            print("New best hr! Saving the model to: ", save_checkpoint_path)
            torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    if args.local_rank == 0:
        LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
        LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
        LOGGER.log(key='best_accuracy', value=max_hr)
        LOGGER.log(key='time_to_target', value=time.time() - main_start_time)
        LOGGER.log(key='time_to_best_model',
                   value=best_model_timestamp - main_start_time)

        LOGGER.log(key=tags.RUN_STOP, value={"success": success})
        LOGGER.log(key=tags.RUN_FINAL)
Example #2
def main():
    args = parse_args()
    print("init distributed")
    init_distributed(args)
    if args.rank == 0:
        wandb.init()
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    torch.manual_seed(1)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
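    # a fixed seed plus deterministic cuDNN kernels (no autotuning) makes runs
    # reproducible at the cost of some speed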

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(
                                   args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(
                                  args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(
                               args.local_rank)))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout).cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2),
                                 eps=args.eps)

    # use torch.mean() with dim later to avoid a copy to host
    criterion = nn.BCEWithLogitsLoss(reduction='none').cuda()

    if args.distributed:
        model = DDP(model, device_ids=[args.local_rank])

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()  # fallback in case hr never improves
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                loss.backward()
            optimizer.step()
            if args.rank == 0:
                wandb.log({"Test loss": loss})
            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })
        if args.rank == 0:
            wandb.log({"Test hit rate": hr})
            wandb.log({"Test train epoch time": train_time})
            wandb.log({"Test train throughput": train_throughput})
        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            #            save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
            print("New best hr!")
            #            torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(data={
            'best_train_throughput': max(train_throughputs),
            'best_eval_throughput': max(eval_throughputs),
            'mean_train_throughput': np.mean(train_throughputs),
            'mean_eval_throughput': np.mean(eval_throughputs),
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time,
            'time_to_best_model': best_model_timestamp - main_start_time
        }, step=tuple())
Example #3
def main():
    from grace_dl.dist.helper import timer, volume, tensor_bits

    args = parse_args()
    init_distributed(args)
    if args.weak_scaling:
        args.batch_size *= args.world_size
    init_wandb(args)
    init_grace(args)
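    # init_grace presumably attaches a GRACE gradient-compression object to args.grc;
    # it is applied to every gradient tensor right before optimizer.step() below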

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:0'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:0'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:0'))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    # if args.distributed:
    #     model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    if args.local_rank == 0:
        print(model)
        print("{} parameters".format(utils.count_parameters(model)))
        # [print(parameter) for parameter in model.parameters()]

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()  # fallback in case hr never improves
    train_throughputs, eval_throughputs = [], []

    # broadcast the initial model state from rank 0 so every worker starts from identical weights
    for p in model.parameters():
        torch.distributed.broadcast(p.data, src=0)
    # if args.local_rank == 0:
    #     save_initial_state_path = os.path.join(args.checkpoint_dir, 'model_init.pth')
    #     print("Saving the model to: ", save_initial_state_path)
    #     torch.save(model.state_dict(), save_initial_state_path)

    for epoch in range(args.epochs):

        begin = time.time()
        train_time = 0

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            batch_start = time.time()
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # check grad sparsity
            if args.sparsity_check:
                total_nonzero = 0
                total_numel = 0
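                # sparsity = fraction of exactly-zero entries in each gradient tensor;
                # per-parameter and total sparsity are logged to wandb on rank 0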
                for index, (name, p) in enumerate(model.named_parameters()):
                    sparsity = 1.0 - torch.sum(
                        p.grad.data.abs() > 0).float() / p.grad.data.numel()
                    total_nonzero += torch.sum(p.grad.data.abs() > 0).float()
                    total_numel += p.grad.data.numel()
                    if args.local_rank == 0:
                        wandb.log(
                            {
                                f"{name}(sparsity)(numel={p.grad.data.numel()})":
                                sparsity,
                            },
                            commit=False)
                if args.local_rank == 0:
                    wandb.log(
                        {
                            f"total_sparsity(numel={total_numel})":
                            1 - total_nonzero / total_numel,
                        },
                        commit=True)

            # add grace just before optimizer.step()
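            # grc.step() presumably compresses, communicates, and decompresses each
            # gradient tensor (GRACE); the synchronize calls bracket it so the timing
            # below measures communication only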
            torch.cuda.synchronize()
            comm_start = time.time()
            for index, (name, p) in enumerate(model.named_parameters()):
                new_grad = args.grc.step(p.grad.data, name)
                p.grad.data = new_grad
            torch.cuda.synchronize()
            timer['comm'] = time.time() - comm_start

            # [torch.distributed.all_reduce(p.grad.data) for p in model.parameters()]
            # for param in model.parameters():
            #     dist.all_reduce(param.grad.data)
            #     param.grad.data /= float(args.world_size)

            optimizer.step()
            for p in model.parameters():
                p.grad = None
            if args.throughput:
                torch.cuda.synchronize()

            if args.log_time and args.local_rank == 0:
                timer['batch_time'] = time.time() - batch_start
                timer['computation'] = timer['batch_time'] - timer['comm']
                print("Timer:", timer, '\n')

                timer['en/decoding'] = 0
                timer['batch_time'] = 0
                timer['computation'] = 0
                timer['comm'] = 0

            if args.log_volume and args.local_rank == 0:
                ratio = volume['compress'] / volume['nocompress']
                volume['ratio_acc'].append(ratio)
                avg_ratio = sum(volume['ratio_acc']) / len(volume['ratio_acc'])
                print(
                    f"Data volume:: compress {volume['compress']} no_compress {volume['nocompress']} ratio {ratio:.4f} avg_ratio {avg_ratio:.4f}"
                )
                volume['compress'] = 0
                volume['nocompress'] = 0

            batch_throughput = args.batch_size / (time.time() - batch_start)  # global (all-rank) throughput
            train_time += time.time() - batch_start
            if (args.throughput
                    or args.eval_at_every_batch) and args.local_rank == 0:
                print(
                    f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                    f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}"
                )

            if args.throughput and i == 3:
                break
            if args.local_rank == 0:
                print(
                    f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                    f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}"
                )
            if args.eval_at_every_batch:
                hr, ndcg = val_epoch(model,
                                     test_users,
                                     test_items,
                                     dup_mask,
                                     real_indices,
                                     args.topk,
                                     samples_per_user=valid_negative + 1,
                                     num_user=all_test_users,
                                     epoch=epoch,
                                     distributed=args.distributed)
                if args.local_rank == 0:
                    wandb.log({
                        "eval/hr@10": hr,
                    })

        del epoch_users, epoch_items, epoch_label
        # train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        if args.throughput:
            train_throughput = batch_throughput
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        if args.local_rank == 0:
            wandb.log(
                {
                    "train_epoch_time": train_time,
                    'validation_epoch_time': val_time,
                    'eval_throughput': eval_throughput,
                    'train_throughput': train_throughput,
                },
                commit=False)
            if not args.eval_at_every_batch:
                wandb.log({
                    "eval/hr@10": hr,
                }, commit=False)
            wandb.log({"epoch": epoch})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

        if args.throughput:
            break

    if args.local_rank == 0:
        dllogger.log(data={
            'best_train_throughput': max(train_throughputs),
            'best_eval_throughput': max(eval_throughputs),
            'mean_train_throughput': np.mean(train_throughputs),
            'mean_eval_throughput': np.mean(eval_throughputs),
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time,
            'time_to_best_model': best_model_timestamp - main_start_time
        }, step=tuple())

        wandb.log({
            'best_train_throughput': max(train_throughputs),
            'best_eval_throughput': max(eval_throughputs),
            'mean_train_throughput': np.mean(train_throughputs),
            'mean_eval_throughput': np.mean(eval_throughputs),
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time,
            'time_to_best_model': best_model_timestamp - main_start_time
        })
Example #4
def main():

    args = parse_args()
    init_distributed(args)
    print(args.desc)
    mlflow.start_run(run_name=args.desc)
    mlflow.log_param('batch_size', args.batch_size)
    mlflow.log_param('num_threads', args.threads)
    mlflow.log_param('num_of_epochs', args.epochs)

    torch.set_num_threads(args.threads)
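    # caps the intra-op CPU thread pool; this variant keeps the ratings tensors on CPU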

    print(f"{vars(args)} step='PARAMETER'")

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    #torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cpu'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cpu'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cpu'))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = Adam(model.parameters(),
                     lr=args.learning_rate,
                     betas=(args.beta1, args.beta2),
                     eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        log_data = {
            'best_eval_throughput': eval_throughput,
            'test_hr_at_10': hr
        }
        print(log_data)
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        log_data = {
            'train_throughput': train_throughput,
            'hr_at_10': hr,
            'train_epoch_time': train_time,
            'validation_epoch_time': val_time,
            'eval_throughput': eval_throughput
        }
        print(log_data)
        mlflow.log_metrics(log_data, epoch)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        log_data = {
            'best_train_throughput': max(train_throughputs),
            'best_eval_throughput': max(eval_throughputs),
            'mean_train_throughput': np.mean(train_throughputs),
            'mean_eval_throughput': np.mean(eval_throughputs),
            'best_accuracy': max_hr,
            'best_epoch': best_epoch,
            'time_to_target': time.time() - main_start_time
        }
        print(log_data)
        mlflow.log_metrics(log_data)

    mlflow.end_run()
Example #5
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Saving results to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_negs = torch.load(args.data+'/test_negatives.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    local_batch = args.batch_size // args.world_size
    #traced_criterion = torch.jit.trace(criterion.forward,
    #                                   (torch.rand(local_batch,1),torch.rand(local_batch,1)))
    traced_criterion = criterion

    pyprof.init()
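    # pyprof.init() instruments torch calls with NVTX markers so profiled kernels
    # can be attributed back to the Python layer that launched them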
    #import importlib
    #pyprof.wrap(importlib.import_module(__name__), "traced_criterion")
    #pyprof.wrap(traced_criterion, "__call__")

    if args.opt_level == "O2":
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                          keep_batchnorm_fp32=False, loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', '') : v for k,v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(), data={'best_eval_throughput' : eval_throughput,
                                         'hr@10' : hr})
        return
    
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()  # fallback in case hr never improves
    train_throughputs, eval_throughputs = [], []

    with torch.autograd.profiler.emit_nvtx():
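        # emit_nvtx wraps every autograd op in an NVTX range, and profiler.start()/stop()
        # below restrict CUDA profiling to a single accumulation step (i == 10)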

        for epoch in range(args.epochs):
        
            begin = time.time()
        
            epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
            num_batches = len(epoch_users)
            for i in range(num_batches // args.grads_accumulated):

                if i == 10:
                    profiler.start()

                for j in range(args.grads_accumulated):
                    batch_idx = (args.grads_accumulated * i) + j
                    user = epoch_users[batch_idx]
                    item = epoch_items[batch_idx]
                    label = epoch_label[batch_idx].view(-1,1)
        
                    outputs = model(user, item)
                    nvtx.range_push("layer:Loss")
                    loss = traced_criterion(outputs, label).float()
                    nvtx.range_pop()
                    nvtx.range_push("layer:Mean")
                    loss = torch.mean(loss.view(-1), 0)
                    nvtx.range_pop()
        
                    if args.opt_level == "O2":
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                nvtx.range_push("layer:Adam")
                optimizer.step()
                nvtx.range_pop()

                if i == 10:
                    profiler.stop()
        
                for p in model.parameters():
                    p.grad = None
        
            del epoch_users, epoch_items, epoch_label
            train_time = time.time() - begin
            begin = time.time()
        
            epoch_samples = len(train_ratings) * (args.negative_samples + 1)
            train_throughput = epoch_samples / train_time
            train_throughputs.append(train_throughput)
        
            hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                                 samples_per_user=valid_negative + 1,
                                 num_user=all_test_users, epoch=epoch, distributed=args.distributed)
        
            val_time = time.time() - begin
        
        
            eval_size = all_test_users * (valid_negative + 1)
            eval_throughput = eval_size / val_time
            eval_throughputs.append(eval_throughput)
        
            dllogger.log(step=(epoch,),
                         data = {'train_throughput': train_throughput,
                                 'hr@10': hr,
                                 'train_epoch_time': train_time,
                                 'validation_epoch_time': val_time,
                                 'eval_throughput': eval_throughput})
        
            if hr > max_hr and args.local_rank == 0:
                max_hr = hr
                best_epoch = epoch
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("New best hr! Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
                best_model_timestamp = time.time()
        
            if args.threshold is not None:
                if hr >= args.threshold:
                    print("Hit threshold of {}".format(args.threshold))
                    break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())