Example #1
def main():
    # Note: The run start is in convert.py

    exp = Experiment(__file__)
    args = exp.get_arguments(parse_args(), show=True)
    device = exp.get_device()
    chrono = exp.chrono()

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())

    tmp = os.environ['TEMP_DIRECTORY']
    run_dir = "{}/run/neumf/{}".format(tmp, config['timestamp'])

    print("Saving config and results to {}".format(run_dir))

    if run_dir != '':
        os.makedirs(run_dir, exist_ok=True)

    utils.save_config(config, run_dir)

    # Load Data
    # ------------------------------------------------------------------------------------------------------------------
    print('Loading data')
    with chrono.time('loading_data', skip_obs=0):
        t1 = time.time()

        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)

        # mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
        # mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)

        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items

        print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d' %
              (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz))
    # ------------------------------------------------------------------------------------------------------------------

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers]).to(device)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    # mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8

    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    # mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss().to(device)

    model.train()

    for epoch in range(args.repeat):
        losses = utils.AverageMeter()

        with chrono.time('train') as t:

            for batch_index, (user, item,
                              label) in enumerate(train_dataloader):
                if batch_index > args.number:
                    break

                user = torch.autograd.Variable(user,
                                               requires_grad=False).to(device)
                item = torch.autograd.Variable(item,
                                               requires_grad=False).to(device)
                label = torch.autograd.Variable(label,
                                                requires_grad=False).to(device)

                outputs = model(user, item)
                loss = criterion(outputs, label)
                exp.log_batch_loss(loss.item())
                losses.update(loss.item(), user.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        exp.log_epoch_loss(losses.sum)

        # Save stats to file
        exp.show_eta(epoch, t)

    exp.report()
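All of these examples aggregate batch losses through utils.AverageMeter, which is never shown in the snippets. A minimal sketch consistent with how it is called here (update(value, n), plus the .val, .avg, and .sum attributes the examples read) might look like this; the exact original helper may differ:

class AverageMeter:
    """Tracks the latest value and a running sum/average (assumed helper)."""

    def __init__(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        # val is the batch-mean loss; n is the batch size, so the sum is
        # weighted correctly when batches differ in size
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count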
Example #2
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):

        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        length = len(loader)
        if length < 101:
            print(
                'Exiting, cannot profile the required 100 iterations. Please re-run with a smaller batch size.'
            )
            cuda.profile_stop()
            exit()
        for batch_index, (user, item, label) in enumerate(loader):
            if batch_index == length // 2 and epoch == 0:
                print('Starting profiling for 100 iterations.')
                cuda.profile_start()

            if batch_index == length // 2 + 100 and epoch == 0:
                print(
                    'Profiling completed, stopping profiling and continuing training.'
                )
                cuda.profile_stop()

            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
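val_epoch is defined elsewhere and returns per-user hit and NDCG lists whose means are the HR@K and NDCG@K values reported above. For reference, a sketch of what those per-user metrics compute for the single held-out item (a hypothetical helper, not the original implementation):

import numpy as np

def hit_and_ndcg(ranked_items, true_item, k):
    """HR@K and NDCG@K for one user, items sorted by descending score."""
    topk = list(ranked_items[:k])
    if true_item not in topk:
        return 0.0, 0.0
    rank = topk.index(true_item)  # 0-based position in the top-K
    # with a single relevant item, IDCG = 1, so NDCG = 1 / log2(rank + 2)
    return 1.0, 1.0 / np.log2(rank + 2)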
Example #3
def main():
    # Note: The run start is in convert.py

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LEARN_RATE,
                         value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
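Example #3 only runs with the MLPerf compliance logging package on the import path. For local experimentation, a no-op stand-in that matches the ncf_print(key, value=None) calls above could be substituted; this stub is an assumption for convenience, not the real mlperf_log module:

class MlperfLogStub:
    """No-op replacement for mlperf_log (assumed; the real module emits compliance tags)."""

    def __getattr__(self, name):
        # constants such as TRAIN_LOOP or BCE resolve to their own names
        return name

    def ncf_print(self, key, value=None):
        print('MLPerf[{}] = {}'.format(key, value))

mlperf_log = MlperfLogStub()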
Example #4
import os
from argparse import ArgumentParser

import torch

from dataset import CFTrainDataset, load_test_ratings, load_test_negs
from convert import (TEST_NEG_FILENAME, TEST_RATINGS_FILENAME,
                     TRAIN_RATINGS_FILENAME)
from neumf import NeuMF  # assumed module path for the NeuMF model


def parse_args():
    parser = ArgumentParser(description="Load a Nerual Collaborative"
                            " Filtering model")
    parser.add_argument('--path', type=str, help='Path to pretrained model')
    return parser.parse_args()


args = parse_args()

print('Loading data')
train_dataset = CFTrainDataset(os.path.join('ml-20m', TRAIN_RATINGS_FILENAME),
                               4)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=2048,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

test_ratings = load_test_ratings(os.path.join(
    'ml-20m', TEST_RATINGS_FILENAME))  # noqa: E501
test_negs = load_test_negs(os.path.join('ml-20m', TEST_NEG_FILENAME))
nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items

# Create model
layers = [256, 256, 128, 64]
model = NeuMF(nb_users,
              nb_items,
              mf_dim=64,  # assumed value; the source snippet is truncated here
              mf_reg=0.,
              mlp_layer_sizes=layers,
              mlp_layer_regs=[0. for i in layers])
Example #5
def main():
    # Note: The run start is in data_preprocess.py

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/MGPM/{}/{}".format(
        os.path.basename(os.path.normpath(args.data)), config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    if use_cuda:
        print("Using cuda ...")
    else:
        print("Using CPU ...")

    t1 = time.time()

    best_hit, best_ndcg = 0., 0.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Load Data
    print('Loading data')
    print(os.path.join(args.data, TRAIN_RATINGS_FILENAME))
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME),
        os.path.join(args.data, DATA_SUMMARY_FILENAME), args.negative_samples)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = Multi_Preference_Model(nb_users=nb_users,
                                   nb_items=nb_items,
                                   embed_dim=32,
                                   history_size=9)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    # optimizer = torch.optim.SGD(model.parameters(),lr=args.learning_rate,momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/' + model._get_name() + '.pd')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_hit = checkpoint['hit']
        best_ndcg = checkpoint['ndcg']

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    if start_epoch == 0:
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(start_epoch, args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, history, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            history = torch.autograd.Variable(history, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                history = history.cuda()
                label = label.cuda()

            # outputs, _ = model(user, item,history)
            outputs = model(user, item, history)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if np.mean(hits) >= best_hit or np.mean(ndcgs) >= best_ndcg:
            best_hit = np.mean(hits)
            best_ndcg = np.mean(ndcgs)
            # Save checkpoint.
            print('Saving checkpoint..')
            state = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'hit': best_hit,
                'ndcg': best_ndcg,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, './checkpoint/' + model._get_name() + '.pd')

    print("Best hit: ", best_hit)
    print("Best_ndcg: ", best_ndcg)

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP)
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
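The checkpoint dictionary saved above can later be restored for evaluation without retraining. A sketch, assuming the same Multi_Preference_Model construction and the './checkpoint/<name>.pd' path used in the example:

import torch

checkpoint = torch.load('./checkpoint/' + model._get_name() + '.pd',
                        map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode for val_epoch-style evaluation
print('Restored epoch {} (best HR={:.4f}, NDCG={:.4f})'.format(
    checkpoint['epoch'], checkpoint['hit'], checkpoint['ndcg']))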
Example #6
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    # TODO: Reading CSVs is slow. Could use HDF or Apache Arrow
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    print('batchsize=%d' % args.batch_size)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=8,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))
    for epoch in range(args.epochs):
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        counting_data = 0
        counting_forward = 0
        counting_zerograd = 0
        counting_backward = 0
        counting_updateweight = 0
        counting_des = 0
        for batch_index, (user, item, label) in enumerate(loader):
            start0 = time.time()
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            start1 = time.time()
            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))
            start2 = time.time()

            optimizer.zero_grad()
            start3 = time.time()
            loss.backward()
            start4 = time.time()
            optimizer.step()
            start5 = time.time()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

            start6 = time.time()

            counting_data += start1 - start0
            counting_forward += start2 - start1
            counting_zerograd += start3 - start2
            counting_backward += start4 - start3
            counting_updateweight += start5 - start4
            counting_des += start6 - start5

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch)
        val_time = time.time() - begin
        print(
            'data: {data:.4f}, forward: {ft:.4f}, zerograd: {zg:.4f}, backward: {bw:.4f},'
            ' adam: {adam:.4f}, description: {des:.4f}'.format(
                data=counting_data,
                ft=counting_forward,
                zg=counting_zerograd,
                bw=counting_backward,
                adam=counting_updateweight,
                des=counting_des))
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
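One caveat about the per-phase counters in Example #6: CUDA kernels launch asynchronously, so time.time() deltas taken on the host mostly measure launch overhead rather than actual GPU work. A sketch of a synchronized timer that gives more faithful phase timings (assuming CUDA is in use):

import time
import torch

def synced_time():
    """Wall-clock time taken only after all queued CUDA work has finished."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()

# Inside the training loop, replace time.time() with synced_time(), e.g.:
# start3 = synced_time(); loss.backward(); start4 = synced_time()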
Example #7
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        # torch.manual_seed(args.seed)
        mx.random.seed(seed_state=args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf_" + args.data + "/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)  # defined in utils.py

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and mx.test_utils.list_gpus()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    # the original file uses 8 workers by default

    # shuffle=True randomizes the sample order
    train_dataloader = mx.gluon.data.DataLoader(dataset=train_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                num_workers=args.workers)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items

    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    if use_cuda:
        ctx = mx.gpu(0)
        # defaults to the first GPU; select another device via Docker or CUDA_VISIBLE_DEVICES
    else:
        ctx = mx.cpu(0)

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  ctx=ctx)
    model.initialize(ctx=ctx)
    model.hybridize()
    print(model)
    # TODO 9: adapt utils.count_parameters for Gluon models
    # print("{} parameters".format(utils.count_parameters(model)))

    # model.collect_params()
    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))
    # model.save_parameters(os.path.join("/home/net.params", 'net.params'))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            processes=args.processes,
                            ctx=ctx)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    ############# hyperparameters
    # Add optimizer and loss to graph
    lr = args.learning_rate
    bs = args.batch_size

    trainer = mx.gluon.Trainer(model.collect_params(), 'adam',
                               {'learning_rate': lr})
    mxnet_criterion = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()  # Gluon analogue of BCEWithLogitsLoss

    # training
    for epoch in range(args.epochs):
        begin = time.time()
        # tqdm shows training progress for the epoch
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            # TODO 7: review how autograd works in MXNet
            # place the batch tensors on the device ctx
            user = nd.array(user, ctx=ctx)
            item = nd.array(item, ctx=ctx)
            label = nd.array(label, ctx=ctx)

            # compute the gradient automatically
            with autograd.record():
                outputs = model(user, item)
                loss = mxnet_criterion(outputs, label.T)

            loss.backward()
            trainer.step(bs)

            loss_number = loss.mean().asscalar()
            description = ('Epoch {}  Loss {:.4f}'.format(epoch, loss_number))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes,
                                ctx=ctx)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                # Save model text description after modelling
                with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
                    file.write(str(model))
                # model.save_parameters(os.path.join("/home/net.params",'net.params'))
                return 0
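A note on trainer.step(bs) in Example #7: calling loss.backward() on the unreduced per-sample loss vector sums gradients over the batch, and Gluon's Trainer.step(batch_size) rescales the update by 1/batch_size, so together they apply a batch-mean gradient. An equivalent sketch that averages the loss explicitly instead:

with autograd.record():
    outputs = model(user, item)
    loss = mxnet_criterion(outputs, label.T).mean()
loss.backward()
trainer.step(1)  # gradients already reflect the batch mean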
Example #8
def main():
    global msglogger

    script_dir = os.path.dirname(__file__)

    args = parse_args()

    # Distiller loggers
    msglogger = apputils.config_pylogger('logging.conf',
                                         args.name,
                                         output_dir=args.output_dir)
    tflogger = TensorBoardLogger(msglogger.logdir)
    # tflogger.log_gradients = True
    # pylogger = PythonLogger(msglogger)

    if args.seed is not None:
        msglogger.info("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    args.qe_mode = str(args.qe_mode).split('.')[1]
    args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1]

    apputils.log_execution_env_state(sys.argv)

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        if len(args.gpus) > 1:
            msglogger.error('ERROR: Only single GPU supported for NCF')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = msglogger.logdir
    msglogger.info("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    training = not (args.eval or args.qe_calibration
                    or args.activation_histograms)
    msglogger.info('Loading data')
    if training:
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)
        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    else:
        train_dataset = None
        train_dataloader = None
        nb_users, nb_items = (138493, 26744)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))

    msglogger.info(
        'Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d' %
        (time.time() - t1, nb_users, nb_items,
         str(train_dataset.mat.nnz) if training else 'N/A', len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  split_final=args.split_final)
    if use_cuda:
        model = model.cuda()
    msglogger.info(model)
    msglogger.info("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    compression_scheduler = None
    start_epoch = 0
    optimizer = None
    if args.load:
        if training:
            model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
                model, args.load)
            if args.reset_optimizer:
                start_epoch = 0
                optimizer = None
        else:
            model = apputils.load_lean_checkpoint(model, args.load)

    # Add loss to graph
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        criterion = criterion.cuda()

    if training and optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)
        model.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    if args.qe_calibration or args.activation_histograms:
        calib = {
            'portion': args.qe_calibration,
            'desc_str': 'quantization calibration stats',
            'collect_func': partial(distiller.data_loggers.collect_quant_stats,
                                    inplace_runtime_check=True,
                                    disable_inplace_attrs=True)
        }
        hists = {
            'portion': args.activation_histograms,
            'desc_str': 'activation histograms',
            'collect_func': partial(distiller.data_loggers.collect_histograms,
                                    activation_stats=None,
                                    nbins=2048,
                                    save_hist_imgs=True)
        }
        d = calib if args.qe_calibration else hists

        distiller.utils.assign_layer_fq_names(model)
        num_users = int(np.floor(len(test_ratings) * d['portion']))
        msglogger.info(
            "Generating {} based on {:.1%} of the test-set ({} users)".format(
                d['desc_str'], d['portion'], num_users))

        test_fn = partial(val_epoch,
                          ratings=test_ratings,
                          negs=test_negs,
                          K=args.topk,
                          use_cuda=use_cuda,
                          processes=args.processes,
                          num_users=num_users)
        d['collect_func'](model=model,
                          test_fn=test_fn,
                          save_dir=run_dir,
                          classes=None)

        return 0

    if args.eval:
        if args.quantize_eval and args.qe_calibration is None:
            model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                model, args)
            dummy_input = (torch.tensor([1]), torch.tensor([1]),
                           torch.tensor([True], dtype=torch.bool))
            quantizer.prepare_model(dummy_input)
            model.cuda()

        distiller.utils.assign_layer_fq_names(model)

        if args.eval_fp16:
            model = model.half()

        # Calculate initial Hit Ratio and NDCG
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        msglogger.info(
            'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, val_time = {val_time:.2f}'
            .format(K=args.topk,
                    hit_rate=hit_rate,
                    ndcg=np.mean(ndcgs),
                    val_time=val_time))
        hit_rate = 0

        if args.quantize_eval:
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(0,
                                     'NCF',
                                     model,
                                     optimizer=None,
                                     extras={'quantized_hr@10': hit_rate},
                                     name='_'.join([args.name, 'quantized'])
                                     if args.name else checkpoint_name,
                                     dir=msglogger.logdir)
        return 0

    total_samples = len(train_dataloader.sampler)
    steps_per_epoch = math.ceil(total_samples / args.batch_size)
    best_hit_rate = 0
    best_epoch = 0
    for epoch in range(start_epoch, args.epochs):
        msglogger.info('')
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, optimizer)

        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch, batch_index, steps_per_epoch, optimizer)

            outputs = model(user, item, torch.tensor([False],
                                                     dtype=torch.bool))
            loss = criterion(outputs, label)

            if compression_scheduler:
                compression_scheduler.before_backward_pass(
                    epoch,
                    batch_index,
                    steps_per_epoch,
                    loss,
                    optimizer,
                    return_loss_components=False)

            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, batch_index,
                                                       steps_per_epoch,
                                                       optimizer)

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

            steps_completed = batch_index + 1
            if steps_completed % args.log_freq == 0:
                stats_dict = OrderedDict()
                stats_dict['Loss'] = losses.avg
                stats = ('Performance/Training/', stats_dict)
                params = model.named_parameters(
                ) if args.log_params_histograms else None
                distiller.log_training_progress(stats, params, epoch,
                                                steps_completed,
                                                steps_per_epoch, args.log_freq,
                                                [tflogger])

                tflogger.log_model_buffers(model,
                                           ['tracked_min', 'tracked_max'],
                                           'Quant/Train/Acts/TrackedMinMax',
                                           epoch, steps_completed,
                                           steps_per_epoch, args.log_freq)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        hit_rate = np.mean(hits)
        mean_ndcgs = np.mean(ndcgs)

        stats_dict = OrderedDict()
        stats_dict['HR@{0}'.format(args.topk)] = hit_rate
        stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs
        stats = ('Performance/Validation/', stats_dict)
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        msglogger.info(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, '
            'train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=mean_ndcgs,
                loss=losses,
                train_time=train_time,
                val_time=val_time))

        is_best = False
        if hit_rate > best_hit_rate:
            best_hit_rate = hit_rate
            is_best = True
            best_epoch = epoch
        extras = {
            'current_hr@10': hit_rate,
            'best_hr@10': best_hit_rate,
            'best_epoch': best_epoch
        }
        apputils.save_checkpoint(epoch,
                                 'NCF',
                                 model,
                                 optimizer,
                                 compression_scheduler,
                                 extras,
                                 is_best,
                                 dir=run_dir)

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                msglogger.info("Hit threshold of {}".format(args.threshold))
                break