def __init__(self, train_fname, nb_neg):
        self._load_train_matrix(train_fname)
        self.nb_neg = nb_neg

        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN,
                             value=nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
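The INPUT_HP_SAMPLE_TRAIN_REPLACEMENT key logged above indicates that train negatives are drawn with replacement. A minimal sketch of what that policy looks like, assuming a set `positives` of known (user, item) pairs (both the name and the helper are hypothetical; `_load_train_matrix` is not shown here):

import numpy as np

def sample_negatives(user, positives, nb_items, nb_neg):
    # With replacement: a sampled item may repeat across draws; a draw is
    # only rejected when it collides with a known positive interaction.
    negs = []
    while len(negs) < nb_neg:
        item = np.random.randint(nb_items)
        if (user, item) not in positives:
            negs.append(item)
    return negs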
Example #2
    def __init__(self, train_fname, data_summary_fname, nb_neg):
        data_summary = pd.read_csv(data_summary_fname, sep=',', header=0)
        self.nb_users = data_summary.loc[0, 'users']
        self.nb_items = data_summary.loc[0, 'items']
        self._load_train_matrix(train_fname)
        self.nb_neg = nb_neg

        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN,
                             value=nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
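For reference, a sketch of the one-row summary CSV this constructor expects; the counts are ML-20M-sized but purely illustrative, and `.loc` replaces the long-removed pandas `.ix` accessor:

import pandas as pd

# Write a hypothetical summary file with the two columns read above.
pd.DataFrame({'users': [138493], 'items': [26744]}).to_csv(
    'data_summary.csv', sep=',', index=False)

data_summary = pd.read_csv('data_summary.csv', sep=',', header=0)
assert data_summary.loc[0, 'users'] == 138493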
Example #3
def main():
    args = parse_args()
    # generate train examples and save.
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN,
                         value=args.nb_neg)
    train_generator = _train_generator(
        os.path.join(args.output, TRAIN_RATINGS_FILENAME), args.nb_neg)
    train_df = pd.DataFrame(train_generator)
    train_df.to_csv(os.path.join(args.output, TRAIN_DATASET_FILENAME),
                    index=False,
                    header=False,
                    sep='\t')
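`_train_generator` is not shown; given that its rows are written headerless and tab-separated, it presumably yields one (user, item, label) tuple per positive plus `nb_neg` sampled negatives. A toy stand-in for that contract (entirely hypothetical):

import numpy as np

def _train_generator_sketch(positives, nb_items, nb_neg):
    # positives: iterable of (user, item) pairs parsed from the ratings file.
    for user, item in positives:
        yield user, item, 1                             # observed interaction
        for _ in range(nb_neg):
            yield user, np.random.randint(nb_items), 0  # sampled negative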
Example #4
def val_epoch(model,
              ratings,
              negs,
              K,
              use_cuda=True,
              output=None,
              epoch=None,
              processes=1):
    if epoch is None:
        print("Initial evaluation")
    else:
        print("Epoch {} evaluation".format(epoch))

    mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)
    start = datetime.now()
    model.eval()
    if processes > 1:
        context = mp.get_context('spawn')
        _eval_one = partial(eval_one, model=model, K=K, use_cuda=use_cuda)
        with context.Pool(processes=processes) as workers:
            hits_ndcg_numpred = workers.starmap(_eval_one, zip(ratings, negs))
        hits, ndcgs, num_preds = zip(*hits_ndcg_numpred)
    else:
        hits, ndcgs, num_preds = [], [], []
        for rating, items in zip(ratings, negs):
            hit, ndcg, num_pred = eval_one(rating,
                                           items,
                                           model,
                                           K,
                                           use_cuda=use_cuda)
            hits.append(hit)
            ndcgs.append(ndcg)
            num_preds.append(num_pred)

    hits = np.array(hits, dtype=np.float32)
    ndcgs = np.array(ndcgs, dtype=np.float32)

    assert len(set(num_preds)) == 1
    num_neg = num_preds[0] - 1  # one true positive, many negatives
    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE,
                         value={
                             "epoch": epoch,
                             "value": len(hits) * (1 + num_neg)
                         })
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=len(hits))
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG, value=num_neg)

    end = datetime.now()
    if output is not None:
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = np.mean(hits)
        result['NDCG'] = np.mean(ndcgs)
        utils.save_result(result, output)

    return hits, ndcgs
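`eval_one` returns a per-user (hit, ndcg, num_pred) triple; the assert above requires every user to be scored on the same candidate count. A sketch of the metric it presumably computes: HR@K is 1 when the true item lands in the top-K scores, and its NDCG contribution is 1/log2(rank + 2) at the zero-based rank:

import math
import numpy as np

def eval_one_sketch(scores, true_index, K):
    # scores: 1-D numpy array of predictions for one positive and many
    # negatives belonging to a single user.
    topk = np.argsort(-scores)[:K]
    if true_index in topk:
        rank = int(np.where(topk == true_index)[0][0])  # zero-based
        return 1, math.log(2) / math.log(rank + 2), len(scores)
    return 0, 0.0, len(scores)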
Example #5
    def __init__(self, nb_users, nb_items, mf_dim, mf_reg, mlp_layer_sizes,
                 mlp_layer_regs):
        if len(mlp_layer_sizes) != len(mlp_layer_regs):
            raise RuntimeError('u dummy, layer_sizes != layer_regs!')
        if mlp_layer_sizes[0] % 2 != 0:
            raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
        super(NeuMF, self).__init__()
        nb_mlp_layers = len(mlp_layer_sizes)

        mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MF_DIM, value=mf_dim)

        # TODO: regularization?
        self.mf_user_embed = nn.Embedding(nb_users, mf_dim)
        self.mf_item_embed = nn.Embedding(nb_items, mf_dim)
        self.mlp_user_embed = nn.Embedding(nb_users, mlp_layer_sizes[0] // 2)
        self.mlp_item_embed = nn.Embedding(nb_items, mlp_layer_sizes[0] // 2)

        mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MLP_LAYER_SIZES,
                             value=mlp_layer_sizes)
        self.mlp = nn.ModuleList()
        for i in range(1, nb_mlp_layers):
            self.mlp.extend([
                nn.Linear(mlp_layer_sizes[i - 1], mlp_layer_sizes[i]),
                nn.ReLU()
            ])  # noqa: E501

        # self.final = nn.Linear(mlp_layer_sizes[-1] + mf_dim, 1)
        self.final_mf = nn.Linear(mf_dim, 1)
        self.final_mlp = nn.Linear(mlp_layer_sizes[-1], 1)

        self.mf_user_embed.weight.data.normal_(0., 0.01)
        self.mf_item_embed.weight.data.normal_(0., 0.01)
        self.mlp_user_embed.weight.data.normal_(0., 0.01)
        self.mlp_item_embed.weight.data.normal_(0., 0.01)

        def glorot_uniform(layer):
            fan_in, fan_out = layer.in_features, layer.out_features
            limit = np.sqrt(6. / (fan_in + fan_out))
            layer.weight.data.uniform_(-limit, limit)

        def lecun_uniform(layer):
            fan_in = layer.in_features
            limit = np.sqrt(3. / fan_in)
            layer.weight.data.uniform_(-limit, limit)

        for layer in self.mlp:
            if not isinstance(layer, nn.Linear):
                continue
            glorot_uniform(layer)
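The forward pass is omitted from this snippet. One reading consistent with the constructor (separate one-unit `final_mf` and `final_mlp` heads) is to sum the two branch logits; a sketch, not necessarily the author's exact wiring:

import torch

def forward(self, user, item, sigmoid=False):
    # GMF branch: element-wise product of the MF embeddings.
    xmf = self.mf_user_embed(user) * self.mf_item_embed(item)
    # MLP branch: concatenated embeddings pushed through the tower.
    xmlp = torch.cat((self.mlp_user_embed(user),
                      self.mlp_item_embed(item)), dim=1)
    for layer in self.mlp:
        xmlp = layer(xmlp)
    logits = self.final_mf(xmf) + self.final_mlp(xmlp)
    return torch.sigmoid(logits) if sigmoid else logits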
Example #6
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user, output=None,
              epoch=None, loss=None, use_cuda=False):

    start = datetime.now()
    log_2 = math.log(2)

    model.eval()

    device = 'cuda' if use_cuda else 'cpu'
    hits = torch.tensor(0., device=device)
    ndcg = torch.tensor(0., device=device)

    with torch.no_grad():
        for i, (u, n) in enumerate(zip(x, y)):
            res = model(u.to(device).view(-1), n.to(device).view(-1),
                        sigmoid=True).detach().view(-1, samples_per_user)
            # set duplicate results for the same item to -1 before topk
            res[dup_mask[i]] = -1
            out = torch.topk(res, K)[1]
            # topk in pytorch is stable (if not sorted):
            # key (item) : value (prediction) pairs keep their original key (item) order,
            # so we need the first position of the real item (stored in real_indices)
            # to check whether it is in the top-k
            ifzero = (out == real_indices[i].to(device).view(-1, 1))
            hits += ifzero.sum()
            ndcg += (log_2 / (torch.nonzero(ifzero)[:, 1].view(-1).to(torch.float) + 2).log_()).sum()

    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=num_user)
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG, value=samples_per_user - 1)

    end = datetime.now()

    hits = hits.item()
    ndcg = ndcg.item()

    if output is not None:
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = hits/num_user
        result['NDCG'] = ndcg/num_user
        result['loss'] = loss
        utils.save_result(result, output)

    return hits/num_user, ndcg/num_user
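Unrolled, the NDCG accumulation above reads: `torch.nonzero(ifzero)[:, 1]` is the zero-based rank at which each hit occurred, and `log_2 / log(rank + 2)` equals `1 / log2(rank + 2)`. A tiny worked check:

import math
import torch

ifzero = torch.tensor([[False, True, False]])   # hit at rank 1 (zero-based)
rank = torch.nonzero(ifzero)[:, 1].float()      # tensor([1.])
ndcg = (math.log(2) / (rank + 2).log()).item()  # 1/log2(3) ~= 0.6309
assert abs(ndcg - 1 / math.log2(3)) < 1e-6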
Example #7
def main():
    args = parse_args()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS,
                         value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # now that the data is filtered and sorted by time, we can split out the test data
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by='user_id')
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    # Note: there is no way to keep the reference training-data ordering, because of the use of a python set and multiple processes.
    # It should not matter, since the data is randomized again later.
    # save train and val data that is fixed.
    train_ratings = torch.from_numpy(train_data.values)
    torch.save(train_ratings, args.output + '/train_ratings.pt')
    test_ratings = torch.from_numpy(test_data.values)
    torch.save(test_ratings, args.output + '/test_ratings.pt')
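This is the standard leave-one-out split: with rows time-sorted, each user's most recent interaction goes to test and the remainder stays in train. On a toy frame:

import pandas as pd

df = pd.DataFrame({'user_id': [0, 0, 0, 1, 1],
                   'item_id': [5, 7, 9, 2, 3]})      # already time-sorted
g = df.groupby('user_id', group_keys=False)
test = g.tail(1)                         # items 9 and 3: last one per user
train = g.apply(lambda x: x.iloc[:-1])   # everything else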
Example #8
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user, output=None,
              epoch=None, distributed=False):

    start = datetime.now()
    log_2 = math.log(2)

    model.eval()

    with torch.no_grad():
        p = []
        for u,n in zip(x,y):
            p.append(model(u, n, sigmoid=True).detach())

        del x
        del y
        temp = torch.cat(p).view(-1,samples_per_user)
        del p
        # set duplicate results for the same item to -1 before topk
        temp[dup_mask] = -1
        out = torch.topk(temp,K)[1]
        # topk in pytorch is stable (if not sorted):
        # key (item) : value (prediction) pairs keep their original key (item) order,
        # so we need the first position of the real item (stored in real_indices)
        # to check whether it is in the top-k
        ifzero = (out == real_indices.view(-1,1))
        hits = ifzero.sum()
        ndcg = (log_2 / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()

    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=num_user)
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG, value=samples_per_user - 1)

    end = datetime.now()

    if distributed:
        torch.distributed.all_reduce(hits, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(ndcg, op=torch.distributed.ReduceOp.SUM)

    hits = hits.item()
    ndcg = ndcg.item()

    if output is not None:
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = hits/num_user
        result['NDCG'] = ndcg/num_user
        utils.save_result(result, output)

    return hits/num_user, ndcg/num_user
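When `distributed=True`, each rank evaluates only its chunk of users, so the raw hit/NDCG sums are all-reduced before the division by the global `num_user`. The same pattern in isolation (a sketch; assumes the default process group is already initialized):

import torch.distributed as dist

def global_rate(local_sum, num_user):
    # Sum the per-rank partial sums in place, then normalize by the
    # global user count so every rank returns the same metric.
    dist.all_reduce(local_sum, op=dist.ReduceOp.SUM)
    return local_sum.item() / num_user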
Example #9
def main():

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    # Check where to put data loader
    if use_cuda:
        dataloader_device = 'cpu' if args.cpu_dataloader else 'cuda'
    else:
        dataloader_device = 'cpu'

    # more like a load-trigger timer now
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.valid_negative)
    # The default of np.random.choice is replace=True, as is pytorch's random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # sync worker before timing.
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    print(datetime.now(), "Loading test ratings.")
    test_ratings = [torch.LongTensor()] * args.user_scaling

    for chunk in range(args.user_scaling):
        test_ratings[chunk] = torch.from_numpy(
            np.load(args.data + '/testx' + str(args.user_scaling) + 'x' +
                    str(args.item_scaling) + '_' + str(chunk) + '.npz',
                    encoding='bytes')['arr_0'])

    fn_prefix = args.data + '/' + CACHE_FN.format(args.user_scaling,
                                                  args.item_scaling)
    sampler_cache = fn_prefix + "cached_sampler.pkl"
    print(datetime.now(), "Loading preprocessed sampler.")
    if os.path.exists(sampler_cache):
        print("Using alias file: {}".format(sampler_cache))
        with open(sampler_cache, "rb") as f:
            sampler, pos_users, pos_items, nb_items, _ = pickle.load(f)
    print(datetime.now(), "Alias table loaded.")

    nb_users = len(sampler.num_regions)
    train_users = torch.from_numpy(pos_users).type(torch.LongTensor)
    train_items = torch.from_numpy(pos_items).type(torch.LongTensor)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))
    # produce things that do not change between epochs
    # mask for filtering duplicates against the real sample
    # note: test data is removed before creating the mask, same as the reference
    # create label
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label

    test_pos = [l[:, 1].reshape(-1, 1) for l in test_ratings]
    test_negatives = [torch.LongTensor()] * args.user_scaling
    test_neg_items = [torch.LongTensor()] * args.user_scaling

    print(datetime.now(), "Loading test negatives.")
    for chunk in range(args.user_scaling):
        file_name = (args.data + '/test_negx' + str(args.user_scaling) + 'x' +
                     str(args.item_scaling) + '_' + str(chunk) + '.npz')
        raw_data = np.load(file_name, encoding='bytes')
        test_negatives[chunk] = torch.from_numpy(raw_data['arr_0'])
        print(
            datetime.now(),
            "Test negative chunk {} of {} loaded ({} users).".format(
                chunk + 1, args.user_scaling, test_negatives[chunk].size()))

    test_neg_items = [l[:, 1] for l in test_negatives]

    # create items with real sample at last position
    test_items = [
        torch.cat((a.reshape(-1, args.valid_negative), b), dim=1)
        for a, b in zip(test_neg_items, test_pos)
    ]
    del test_ratings, test_neg_items

    # generate dup mask and real indices for exactly the same behavior on duplicates as the reference
    # here we need a sort that is stable (keeps the order of duplicates)
    # this is a version that works on integers
    sorted_items, indices = zip(*[torch.sort(l)
                                  for l in test_items])  # [1,1,1,2], [3,1,0,2]
    sum_item_indices = [
        a.float() + b.float() / len(b[0])
        for a, b in zip(sorted_items, indices)
    ]  #[1.75,1.25,1.0,2.5]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]  #[2,1,0,3]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]  #[0,1,3,2]
    # produce -1 mask
    dup_mask = [(l[:, 0:-1] == l[:, 1:]) for l in sorted_items]
    dup_mask = [
        torch.cat((torch.zeros_like(a, dtype=torch.uint8), b), dim=1)
        for a, b in zip(test_pos, dup_mask)
    ]
    dup_mask = [
        torch.gather(a, 1,
                     b.sort()[1]) for a, b in zip(dup_mask, stable_indices)
    ]
    # produce real sample indices to later check in topk
    sorted_items, indices = zip(*[(a != b).sort()
                                  for a, b in zip(test_items, test_pos)])
    sum_item_indices = [(a.float()) + (b.float()) / len(b[0])
                        for a, b in zip(sorted_items, indices)]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]
    real_indices = [l[:, 0] for l in stable_indices]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    # For our dataset, test set is identical to user set, so arange() provides
    # all test users.
    test_users = torch.arange(nb_users, dtype=torch.long)
    test_users = test_users[:, None]
    test_users = test_users + torch.zeros(1 + args.valid_negative,
                                          dtype=torch.long)
    # test_items needs to be of type Long in order to be used in embedding
    test_items = torch.cat(test_items).type(torch.long)

    dup_mask = torch.cat(dup_mask)
    real_indices = torch.cat(real_indices)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # we shuffle later with randperm

    print(
        datetime.now(),
        "Data loading done {:.1f} sec. #user={}, #item={}, #train={}, #test={}"
        .format(time.time() - run_start_time, nb_users, nb_items,
                len(train_users), nb_users))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    params = model.parameters()

    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2),
                                 eps=args.eps)
    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    local_batch = args.batch_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    samples_per_user = test_items.size(1)
    users_per_valid_batch = args.valid_batch_size // samples_per_user

    test_users = test_users.split(users_per_valid_batch)
    test_items = test_items.split(users_per_valid_batch)
    dup_mask = dup_mask.split(users_per_valid_batch)
    real_indices = real_indices.split(users_per_valid_batch)

    hr, ndcg = val_epoch(model,
                         test_users,
                         test_items,
                         dup_mask,
                         real_indices,
                         args.topk,
                         samples_per_user=samples_per_user,
                         num_user=nb_users)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=hr, ndcg=ndcg))
    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):

        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()

        st = timeit.default_timer()
        if args.random_negatives:
            neg_users = train_users.repeat(args.negative_samples)
            neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(
                0, nb_items)
        else:
            negatives = generate_negatives(sampler, args.negative_samples,
                                           train_users.numpy())
            negatives = torch.from_numpy(negatives)
            neg_users = negatives[:, 0]
            neg_items = negatives[:, 1]

        print("generate_negatives loop time: {:.2f}",
              timeit.default_timer() - st)

        after_neg_gen = time.time()

        st = timeit.default_timer()
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))
        del neg_users, neg_items

        # shuffle prepared data and split into batches
        epoch_indices = torch.randperm(len(epoch_users),
                                       device=dataloader_device)
        epoch_size = len(epoch_indices)
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]
        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        print("shuffle time: {:.2f}", timeit.default_timer() - st)

        num_batches = (epoch_size + args.batch_size - 1) // args.batch_size
        qbar = tqdm.tqdm(range(num_batches))
        # handle the extremely rare case where the last batch is smaller than the number of workers
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of workers!")
            exit(1)

        after_shuffle = time.time()

        neg_gen_time = (after_neg_gen - begin)
        shuffle_time = (after_shuffle - after_neg_gen)

        for i in qbar:
            # selecting input from prepared data
            user = epoch_users_list[i].cuda()
            item = epoch_items_list[i].cuda()
            label = epoch_label_list[i].view(-1, 1).cuda()

            for p in model.parameters():
                p.grad = None

            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)

            loss.backward()
            optimizer.step()

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=samples_per_user,
                             num_user=nb_users,
                             output=valid_results_file,
                             epoch=epoch,
                             loss=loss.data.item())

        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}, loss = {loss:.4f},'
            ' neg_gen: {neg_gen_time:.4f}, shuffle_time: {shuffle_time:.2f}'.
            format(epoch=epoch,
                   K=args.topk,
                   hit_rate=hr,
                   ndcg=ndcg,
                   train_time=train_time,
                   val_time=val_time,
                   loss=loss.data.item(),
                   neg_gen_time=neg_gen_time,
                   shuffle_time=shuffle_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": hr
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP, value=epoch)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)

    # easy way of tracking mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
Example #10
def main():
    # Note: The run start is in data_preprocess.py

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/MGPM/{}/{}".format(
        os.path.basename(os.path.normpath(args.data)), config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    if use_cuda:
        print("Using cuda ...")
    else:
        print("Using CPU ...")

    t1 = time.time()

    best_hit, best_ndcg = 0., 0.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Load Data
    print('Loading data')
    print(os.path.join(args.data, TRAIN_RATINGS_FILENAME))
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME),
        os.path.join(args.data, DATA_SUMMARY_FILENAME), args.negative_samples)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = Multi_Preference_Model(nb_users=nb_users,
                                   nb_items=nb_items,
                                   embed_dim=32,
                                   history_size=9)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    # optimizer = torch.optim.SGD(model.parameters(),lr=args.learning_rate,momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/' + model._get_name() + '.pd')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_hit = checkpoint['hit']
        best_ndcg = checkpoint['ndcg']

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    if start_epoch == 0:
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(start_epoch, args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, history, label) in enumerate(loader):
            # torch.autograd.Variable is a no-op wrapper in modern PyTorch,
            # so the tensors from the DataLoader are used directly
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                history = history.cuda()
                label = label.cuda()

            # outputs, _ = model(user, item,history)
            outputs = model(user, item, history)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if np.mean(hits) >= best_hit or np.mean(ndcgs) >= best_ndcg:
            best_hit = np.mean(hits)
            best_ndcg = np.mean(ndcgs)
            # Save checkpoint.
            print('Saving checkpoint..')
            state = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'hit': best_hit,
                'ndcg': best_ndcg,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, './checkpoint/' + model._get_name() + '.pd')

    print("Best hit: ", best_hit)
    print("Best_ndcg: ", best_ndcg)

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP)
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
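`utils.AverageMeter` is not shown; the `loss.val` / `loss.avg` fields used in the progress-bar description suggest the usual running-average helper, along these lines (a sketch, not the repo's actual class):

class AverageMeter:
    """Track the latest value and a size-weighted running average."""
    def __init__(self):
        self.val = self.sum = self.count = self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count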
Example #11
def main():
    # Note: The run start is in convert.py

    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    # File is located in a subdirectory of the data directory
    run_dir = os.path.join(args.data,
                           "run_neumf_{}".format(config['timestamp']))
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    # set shuffle=True in DataLoader
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader, disable=args.progress_bar)
        for batch_index, (user, item, label) in enumerate(loader):
            # torch.autograd.Variable is a no-op wrapper in modern PyTorch,
            # so the tensors from the DataLoader are used directly
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                label = label.cuda()

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs),
                train_time=train_time,
                val_time=val_time))
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
Example #12
def main():
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS,
                         value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))

    print("Generating {} negative samples for each user".format(
        args.negatives))
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.negatives)

    # The default of np.random.choice is replace=True
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)

    #===========================================================================
    #== First random operation triggers the clock start. =======================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    for user in tqdm(range(len(original_users)),
                     desc='Users',
                     total=len(original_users)):  # noqa: E501
        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism

        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False,
                            header=False,
                            sep='\t')

    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE,
                         value=len(df_train_ratings))

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False,
                           header=False,
                           sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False,
                        header=False,
                        sep='\t')
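All three files are written headerless and tab-separated, with a constant `fake_rating` column on the ratings so downstream loaders can reuse explicit-feedback parsers. A consumer would read them back along these lines (file and column names hypothetical; the real ones come from the filename constants above):

import pandas as pd

train = pd.read_csv('train_ratings.csv', sep='\t', header=None,
                    names=['user_id', 'item_id', 'fake_rating'])
test_negs = pd.read_csv('test_negatives.csv', sep='\t',
                        header=None)  # one row of negative items per user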
Example #13
def main():

    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}.{}".format(config['timestamp'],args.local_rank)
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # more like a load-trigger timer now
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True, as is pytorch's random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # sync worker before timing.
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    # load the unconverted data; just a separate file for test
    train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))

    # get input data
    # get dims
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item()+1
    nb_items = nb_maxs[1].item()+1
    train_users = train_ratings[:,0]
    train_items = train_ratings[:,1]
    del nb_maxs, train_ratings
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))
    # produce things that do not change between epochs
    # mask for filtering duplicates against the real sample
    # note: test data is removed before creating the mask, same as the reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0
    # create label
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label,neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # produce validation negative sample on GPU
    all_test_users = test_ratings.shape[0]

    test_users = test_ratings[:,0]
    test_pos = test_ratings[:,1].reshape(-1,1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]

    # create items with real sample at last position
    test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # generate dup mask and real indices for exactly the same behavior on duplicates as the reference
    # here we need a sort that is stable (keeps the order of duplicates)
    # this is a version that works on integers
    sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
    # produce -1 mask
    dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
    dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
    # produce real sample indices to later check in topk
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:,0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # we shuffle later with randperm

    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time()-run_start_time, nb_users, nb_items, len(train_users),
             nb_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])

    if args.fp16:
        model = model.half()

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    if args.fp16:
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()

    #optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = AdamOpt(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.distributed:
        model = DDP(model)
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')
    # Calculate initial Hit Ratio and NDCG
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)

    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
                         num_user=all_test_users, distributed=args.distributed)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
          .format(K=args.topk, hit_rate=hr, ndcg=ndcg))
    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):

        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        # prepare data for epoch
        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
        epoch_users = torch.cat((train_users,neg_users))
        epoch_items = torch.cat((train_items,neg_items))
        del neg_users, neg_items

        # shuffle prepared data and split into batches
        epoch_indices = torch.randperm(len(epoch_users), device='cuda:{}'.format(args.local_rank))
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]
        if args.distributed:
            epoch_users = torch.chunk(epoch_users, args.world_size)[args.local_rank]
            epoch_items = torch.chunk(epoch_items, args.world_size)[args.local_rank]
            epoch_label = torch.chunk(epoch_label, args.world_size)[args.local_rank]
        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        # only print progress bar on rank 0
        num_batches = (len(epoch_indices) + args.batch_size - 1) // args.batch_size
        if args.local_rank == 0:
            qbar = tqdm.tqdm(range(num_batches))
        else:
            qbar = range(num_batches)
        # handle the extremely rare case where the last batch is smaller than the number of workers
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of workers!")
            exit(1)

        for i in qbar:
            # selecting input from prepared data
            user = epoch_users_list[i]
            item = epoch_items_list[i]
            label = epoch_label_list[i].view(-1,1)

            for p in model.parameters():
                p.grad = None

            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)

            if args.fp16:
                fp_optimizer.step(loss, optimizer)
            else:
                loss.backward()
                optimizer.step()

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START)

        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
                             num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed)

        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=hr,
                      ndcg=ndcg, train_time=train_time,
                      val_time=val_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)

    # simple way of tracking the mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
Exemple #14
0
def main():
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.file))
    #-------------- MovieLens dataset ------------------------------
    # df = implicit_load(args.file, sort=False)
    #---------------------------------------------------------------

    #------ retailrocket-recommender-system-dataset --------------------
    # df = pd.read_csv(args.file, sep=',', header=0)
    # df.columns = ['timestamp', 'user_id', 'event', 'item_id', 'transaction_id']
    # df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    #
    #
    # RatingData = namedtuple('RatingData',
    #                         ['items', 'users', 'ratings', 'min_date', 'max_date'])
    # info = RatingData(items=len(df['item_id'].unique()),
    #                   users=len(df['user_id'].unique()),
    #                   ratings=len(df),
    #                   min_date=df['timestamp'].min(),
    #                   max_date=df['timestamp'].max())
    # print("{ratings} ratings on {items} items from {users} users"
    #           " from {min_date} to {max_date}"
    #           .format(**(info._asdict())))
    #--------------------------------------------------------------------

    #-------------------amazon dataset------------------------
    # df = pd.read_csv(args.file, sep=',', header=None)
    # df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    #
    # RatingData = namedtuple('RatingData',
    #                         ['items', 'users', 'ratings', 'min_date', 'max_date'])
    # info = RatingData(items=len(df['item_id'].unique()),
    #                   users=len(df['user_id'].unique()),
    #                   ratings=len(df),
    #                   min_date=df['timestamp'].min(),
    #                   max_date=df['timestamp'].max())
    # print("{ratings} ratings on {items} items from {users} users"
    #           " from {min_date} to {max_date}"
    #           .format(**(info._asdict())))

    #-------------------------------------------------------------------------

    #------------------- hetrec2011 dataset------------------------
    # df = pd.read_csv(args.file, sep='\t', header=0)
    # df.columns = ['user_id', 'item_id', 'tag_id', 'timestamp']
    # df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    #
    # RatingData = namedtuple('RatingData',
    #                         ['items', 'users', 'ratings', 'min_date', 'max_date'])
    # info = RatingData(items=len(df['item_id'].unique()),
    #                   users=len(df['user_id'].unique()),
    #                   ratings=len(df),
    #                   min_date=df['timestamp'].min(),
    #                   max_date=df['timestamp'].max())
    # print("{ratings} ratings on {items} items from {users} users"
    #           " from {min_date} to {max_date}"
    #           .format(**(info._asdict())))
    #

    #-------------------------------------------------------------------------

    #------------------- taobao UserBehavior dataset------------------------
    df = pd.read_csv(args.file, sep=',', header=None)
    df.columns = [
        'user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp'
    ]
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

    RatingData = namedtuple(
        'RatingData', ['items', 'users', 'ratings', 'min_date', 'max_date'])
    info = RatingData(items=len(df['item_id'].unique()),
                      users=len(df['user_id'].unique()),
                      ratings=len(df),
                      min_date=df['timestamp'].min(),
                      max_date=df['timestamp'].max())
    print("{ratings} ratings on {items} items from {users} users"
          " from {min_date} to {max_date}".format(**(info._asdict())))

    #-------------------------------------------------------------------------

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS,
                         value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    nb_users = len(original_users)
    nb_items = len(original_items)

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])
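
    # A vectorized pandas alternative with identical results (both factorize
    # and unique() assign codes in order of first appearance):
    # df[USER_COLUMN], _ = pd.factorize(df[USER_COLUMN])
    # df[ITEM_COLUMN], _ = pd.factorize(df[ITEM_COLUMN])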

    # print(df)

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df.sort_values(by='timestamp', inplace=True)
    # NOTE: all_ratings is unused below; per-user negatives are derived from all_items - set(value)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    # debug output: inspect the first user's full history and its last history_size items
    print(len(user_to_items[0]))
    print(user_to_items[0])
    print(user_to_items[0][-args.history_size:])

    print("Generating {} negative test samples for each user and creating "
          "the train/test split".format(args.negatives))
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.negatives)

    train_ratings = []
    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
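
    # Leave-one-out split, built in the loop below: each user's most recent
    # item becomes the test target (saved alongside args.negatives sampled
    # negatives and the preceding history window), and every earlier item
    # beyond the history window becomes a training row of its own. all_negs
    # is sorted so that random.sample is deterministic under a fixed seed.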

    for key, value in tqdm(user_to_items.items(), total=len(user_to_items)):
        all_negs = sorted(all_items - set(value))
        negs = random.sample(all_negs, args.negatives)

        test_item = value.pop()

        tmp = [key, test_item]
        tmp.extend(negs)
        test_negs.append(tmp)

        tmp = [key, test_item]
        tmp.extend(value[-args.history_size:])
        test_ratings.append(tmp)

        while len(value) > args.history_size:
            target_item = value.pop()
            tmp = [key, target_item]
            tmp.extend(value[-args.history_size:])
            train_ratings.append(tmp)
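
    # Worked example: with user history [a, b, c, d, e] and history_size=3 the
    # test row is (user, e | b, c, d); the loop above then emits one training
    # row (user, d | a, b, c) and stops once only history_size items remain.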

    print("\nSaving train and test CSV files to {}".format(args.output))

    df_train_ratings = pd.DataFrame(list(train_ratings))
    df_test_ratings = pd.DataFrame(list(test_ratings))
    df_test_negs = pd.DataFrame(list(test_negs))

    print('Saving data description ...')
    data_summary = pd.DataFrame(
        {
            'users': nb_users,
            'items': nb_items,
            'history_size': args.history_size,
            'train_entries': len(df_train_ratings),
            'test_entries': len(df_test_ratings)
        },
        index=[0])
    data_summary.to_csv(os.path.join(args.output, DATA_SUMMARY_FILENAME),
                        header=True,
                        index=False,
                        sep=',')

    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False,
                            header=False,
                            sep='\t')

    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE,
                         value=len(df_train_ratings))

    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False,
                           header=False,
                           sep='\t')

    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False,
                        header=False,
                        sep='\t')
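
# Example invocation (script name and flag spellings are assumptions; the real
# flags come from parse_args, defined elsewhere in this script):
#   python convert.py --file UserBehavior.csv --output ./data \
#       --negatives 99 --history_size 9 --seed 0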