def __init__(self, train_fname, nb_neg):
    """Load the training rating matrix and record the negative-sample count.

    Args:
        train_fname: Path to the training ratings file; consumed by
            ``self._load_train_matrix`` (defined elsewhere in this class).
        nb_neg: Number of negative samples to draw per positive example.
    """
    self._load_train_matrix(train_fname)
    self.nb_neg = nb_neg  # negatives drawn per positive when sampling batches
    # MLPerf compliance logging: record negative-generation hyperparameters.
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN, value=nb_neg)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
def __init__(self, train_fname, data_summary_fname, nb_neg):
    """Load the training matrix plus a summary CSV giving user/item counts.

    Args:
        train_fname: Path to the training ratings file; consumed by
            ``self._load_train_matrix`` (defined elsewhere in this class).
        data_summary_fname: CSV with a header row and at least the columns
            ``users`` and ``items``; only the first data row is read.
        nb_neg: Number of negative samples to draw per positive example.
    """
    data_summary = pd.read_csv(data_summary_fname, sep=',', header=0)
    # Fix: `.ix` was deprecated in pandas 0.20 and removed in 1.0.
    # `.iloc[0]` selects the same first row by position.
    self.nb_users = data_summary.iloc[0]['users']
    self.nb_items = data_summary.iloc[0]['items']
    self._load_train_matrix(train_fname)
    self.nb_neg = nb_neg  # negatives drawn per positive when sampling batches
    # MLPerf compliance logging: record negative-generation hyperparameters.
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN, value=nb_neg)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
def main():
    """Generate negative-sampled training examples and save them as TSV."""
    args = parse_args()
    # MLPerf compliance logging: record how many negatives we generate.
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN,
                         value=args.nb_neg)
    ratings_path = os.path.join(args.output, TRAIN_RATINGS_FILENAME)
    dataset_path = os.path.join(args.output, TRAIN_DATASET_FILENAME)
    # Materialize the generator into a DataFrame and write it without
    # index or header, tab-separated, as downstream loaders expect.
    examples = _train_generator(ratings_path, args.nb_neg)
    frame = pd.DataFrame(examples)
    frame.to_csv(dataset_path, index=False, header=False, sep='\t')
def val_epoch(model, ratings, negs, K, use_cuda=True, output=None, epoch=None,
              processes=1):
    """Evaluate hit-rate@K and NDCG@K over all test users.

    Args:
        model: Trained ranking model; put into eval mode here.
        ratings: Iterable of test ratings, one entry per user.
        negs: Iterable of negative item lists, parallel to ``ratings``.
        K: Cutoff for top-K metrics.
        use_cuda: Forwarded to ``eval_one``.
        output: If given, a file path where a result row is appended.
        epoch: Epoch number for logging; ``None`` means initial evaluation.
        processes: If > 1, evaluate users in a spawn-context process pool.

    Returns:
        Tuple ``(hits, ndcgs)`` of per-user float32 numpy arrays.
    """
    if epoch is None:
        print("Initial evaluation")
    else:
        print("Epoch {} evaluation".format(epoch))
    mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)
    start = datetime.now()
    model.eval()
    if processes > 1:
        # 'spawn' avoids forking CUDA state into the workers.
        context = mp.get_context('spawn')
        _eval_one = partial(eval_one, model=model, K=K, use_cuda=use_cuda)
        with context.Pool(processes=processes) as workers:
            hits_ndcg_numpred = workers.starmap(_eval_one, zip(ratings, negs))
        hits, ndcgs, num_preds = zip(*hits_ndcg_numpred)
    else:
        # Serial fallback: evaluate one user at a time.
        hits, ndcgs, num_preds = [], [], []
        for rating, items in zip(ratings, negs):
            hit, ndcg, num_pred = eval_one(rating, items, model, K,
                                           use_cuda=use_cuda)
            hits.append(hit)
            ndcgs.append(ndcg)
            num_preds.append(num_pred)
    hits = np.array(hits, dtype=np.float32)
    ndcgs = np.array(ndcgs, dtype=np.float32)
    # Every user must have been scored against the same number of candidates.
    assert len(set(num_preds)) == 1
    num_neg = num_preds[0] - 1  # one true positive, many negatives
    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE, value={
        "epoch": epoch,
        "value": len(hits) * (1 + num_neg)
    })
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=len(hits))
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG, value=num_neg)
    end = datetime.now()
    if output is not None:
        # Append one summary row (means over users) to the results file.
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = np.mean(hits)
        result['NDCG'] = np.mean(ndcgs)
        utils.save_result(result, output)
    return hits, ndcgs
def __init__(self, nb_users, nb_items, mf_dim, mf_reg, mlp_layer_sizes,
             mlp_layer_regs):
    """Build the NeuMF model: MF embeddings plus an MLP tower.

    Args:
        nb_users: Number of distinct users (embedding rows).
        nb_items: Number of distinct items (embedding rows).
        mf_dim: Dimensionality of the matrix-factorization embeddings.
        mf_reg: MF regularization strength (currently unused; see TODO).
        mlp_layer_sizes: Layer widths for the MLP tower; the first entry
            must be even because it is split between user and item halves.
        mlp_layer_regs: Per-layer regularization; must match layer count.

    Raises:
        RuntimeError: On mismatched layer/reg lengths or odd first layer.
    """
    if len(mlp_layer_sizes) != len(mlp_layer_regs):
        raise RuntimeError('u dummy, layer_sizes != layer_regs!')
    if mlp_layer_sizes[0] % 2 != 0:
        raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
    super(NeuMF, self).__init__()
    nb_mlp_layers = len(mlp_layer_sizes)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MF_DIM, value=mf_dim)
    # TODO: regularization?
    self.mf_user_embed = nn.Embedding(nb_users, mf_dim)
    self.mf_item_embed = nn.Embedding(nb_items, mf_dim)
    # MLP input is the concatenation of user and item halves, hence // 2.
    self.mlp_user_embed = nn.Embedding(nb_users, mlp_layer_sizes[0] // 2)
    self.mlp_item_embed = nn.Embedding(nb_items, mlp_layer_sizes[0] // 2)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MLP_LAYER_SIZES,
                         value=mlp_layer_sizes)
    self.mlp = nn.ModuleList()
    for i in range(1, nb_mlp_layers):
        self.mlp.extend([
            nn.Linear(mlp_layer_sizes[i - 1], mlp_layer_sizes[i]),
            nn.ReLU()
        ])  # noqa: E501
    # self.final = nn.Linear(mlp_layer_sizes[-1] + mf_dim, 1)
    # Separate heads for the MF and MLP branches instead of one fused head.
    self.final_mf = nn.Linear(mf_dim, 1)
    self.final_mlp = nn.Linear(mlp_layer_sizes[-1], 1)
    # All embeddings start from N(0, 0.01), matching the reference NCF init.
    self.mf_user_embed.weight.data.normal_(0., 0.01)
    self.mf_item_embed.weight.data.normal_(0., 0.01)
    self.mlp_user_embed.weight.data.normal_(0., 0.01)
    self.mlp_item_embed.weight.data.normal_(0., 0.01)

    def golorot_uniform(layer):
        # Glorot/Xavier uniform: limit = sqrt(6 / (fan_in + fan_out)).
        fan_in, fan_out = layer.in_features, layer.out_features
        limit = np.sqrt(6. / (fan_in + fan_out))
        layer.weight.data.uniform_(-limit, limit)

    def lecunn_uniform(layer):
        # LeCun uniform: limit = sqrt(3 / fan_in).
        fan_in, fan_out = layer.in_features, layer.out_features  # noqa: F841, E501
        limit = np.sqrt(3. / fan_in)
        layer.weight.data.uniform_(-limit, limit)

    # NOTE(review): lecunn_uniform is defined but never applied; the final_mf
    # and final_mlp layers keep PyTorch's default init here. The reference NCF
    # initializes its prediction layer with LeCun uniform — confirm this
    # omission is intentional.
    for layer in self.mlp:
        if type(layer) != nn.Linear:
            continue
        golorot_uniform(layer)
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user,
              num_user, output=None, epoch=None, loss=None, use_cuda=False):
    """Chunked evaluation of HR@K and NDCG@K over pre-batched test data.

    Args:
        model: Ranking model; called as ``model(users, items, sigmoid=True)``.
        x: Sequence of user-id tensor chunks.
        y: Sequence of item-id tensor chunks, parallel to ``x``.
        dup_mask: Per-chunk boolean masks marking duplicate items whose
            scores are forced to -1 before top-K selection.
        real_indices: Per-chunk positions of the true positive item.
        K: Top-K cutoff.
        samples_per_user: Candidates scored per user (1 positive + negatives).
        num_user: Total number of test users (metric denominator).
        output: Optional results file appended via ``utils.save_result``.
        epoch: Epoch number for logging.
        loss: Training loss to record alongside the metrics.
        use_cuda: Run scoring on GPU when True.

    Returns:
        Tuple ``(hit_rate, ndcg)`` averaged over ``num_user``.
    """
    start = datetime.now()
    log_2 = math.log(2)
    model.eval()
    # Fix: the original initialized hits/ndcg on CPU and then immediately
    # re-initialized them in this same if/else — the first assignment was
    # dead code and has been removed. Accumulators live on the scoring
    # device so the in-loop additions stay device-local.
    if use_cuda:
        hits = torch.tensor(0., device='cuda')
        ndcg = torch.tensor(0., device='cuda')
    else:
        hits = torch.tensor(0.)
        ndcg = torch.tensor(0.)
    with torch.no_grad():
        for i, (u, n) in enumerate(zip(x, y)):
            if use_cuda:
                res = model(u.cuda().view(-1), n.cuda().view(-1),
                            sigmoid=True).detach().view(-1, samples_per_user)
            else:
                res = model(u.cpu().view(-1), n.cpu().view(-1),
                            sigmoid=True).detach().view(-1, samples_per_user)
            # set duplicate results for the same item to -1 before topk
            res[dup_mask[i]] = -1
            out = torch.topk(res, K)[1]
            # topk in pytorch is stable (if not sort)
            # key(item):value(prediction) pairs are ordered as original
            # key(item) order, so we need the first position of the real item
            # (stored in real_indices) to check if it is in the top-K
            if use_cuda:
                ifzero = (out == real_indices[i].cuda().view(-1, 1))
            else:
                ifzero = (out == real_indices[i].cpu().view(-1, 1))
            hits += ifzero.sum()
            # NDCG contribution: log(2) / log(rank + 2) for each hit.
            ndcg += (log_2 / (torch.nonzero(ifzero)[:, 1].view(-1)
                              .to(torch.float) + 2).log_()).sum()
    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE,
                         value={"epoch": epoch,
                                "value": num_user * samples_per_user})
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=num_user)
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG,
                         value=samples_per_user - 1)
    end = datetime.now()
    hits = hits.item()
    ndcg = ndcg.item()
    if output is not None:
        # Append one summary row to the results file.
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = hits / num_user
        result['NDCG'] = ndcg / num_user
        result['loss'] = loss
        utils.save_result(result, output)
    return hits / num_user, ndcg / num_user
def main():
    """Preprocess raw implicit-feedback ratings into train/test tensors.

    Filters low-activity users, remaps IDs to dense ranges, splits the last
    (most recent) interaction per user into the test set, and saves both
    splits as torch tensors under ``args.output``.
    """
    args = parse_args()
    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)
    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS,
                         value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)
    print("Mapping original user and item IDs to new sequential IDs")
    # factorize() assigns dense 0..n-1 codes in first-appearance order.
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]
    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order
    # now we have filtered and sorted by time data, we can split test data out
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    # Last (newest) interaction per user becomes that user's test example.
    test_data = grouped_sorted.tail(1).sort_values(by='user_id')
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
    # Note: no way to keep reference training data ordering because use of
    # python set and multi-process
    # It should not matter since it will be later randomized again
    # save train and val data that is fixed.
    train_ratings = torch.from_numpy(train_data.values)
    torch.save(train_ratings, args.output + '/train_ratings.pt')
    test_ratings = torch.from_numpy(test_data.values)
    torch.save(test_ratings, args.output + '/test_ratings.pt')
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user,
              num_user, output=None, epoch=None, distributed=False):
    """Evaluate HR@K and NDCG@K, optionally all-reducing across ranks.

    Args:
        model: Ranking model; called as ``model(u, n, sigmoid=True)``.
        x: Sequence of user-id tensor chunks.
        y: Sequence of item-id tensor chunks, parallel to ``x``.
        dup_mask: Boolean mask (full, not per-chunk) marking duplicate items
            whose scores are forced to -1 before top-K selection.
        real_indices: Positions of the true positive item for every user.
        K: Top-K cutoff.
        samples_per_user: Candidates scored per user (1 positive + negatives).
        num_user: Total number of test users across all ranks (denominator).
        output: Optional results file appended via ``utils.save_result``.
        epoch: Epoch number for logging.
        distributed: When True, sum the raw hit/ndcg counters over all ranks.

    Returns:
        Tuple ``(hit_rate, ndcg)`` averaged over ``num_user``.
    """
    start = datetime.now()
    log_2 = math.log(2)
    model.eval()
    with torch.no_grad():
        # Score every chunk, then reassemble into one (users, samples) matrix.
        p = []
        for u, n in zip(x, y):
            p.append(model(u, n, sigmoid=True).detach())
        del x
        del y
        temp = torch.cat(p).view(-1, samples_per_user)
        del p
        # set duplicate results for the same item to -1 before topk
        temp[dup_mask] = -1
        out = torch.topk(temp, K)[1]
        # topk in pytorch is stable (if not sort)
        # key(item):value(prediction) pairs are ordered as original key(item)
        # order, so we need the first position of the real item (stored in
        # real_indices) to check if it is in the top-K
        ifzero = (out == real_indices.view(-1, 1))
        hits = ifzero.sum()
        # NDCG contribution: log(2) / log(rank + 2) for each hit.
        ndcg = (log_2 / (torch.nonzero(ifzero)[:, 1].view(-1)
                         .to(torch.float) + 2).log_()).sum()
    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE,
                         value={"epoch": epoch,
                                "value": num_user * samples_per_user})
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=num_user)
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG,
                         value=samples_per_user - 1)
    end = datetime.now()
    if distributed:
        # Fix: `torch.distributed.reduce_op` was deprecated and later removed;
        # `ReduceOp` is the supported enum with identical semantics.
        torch.distributed.all_reduce(hits, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(ndcg, op=torch.distributed.ReduceOp.SUM)
    hits = hits.item()
    ndcg = ndcg.item()
    if output is not None:
        # Append one summary row to the results file.
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = hits / num_user
        result['NDCG'] = ndcg / num_user
        utils.save_result(result, output)
    return hits / num_user, ndcg / num_user
def main():
    """End-to-end large-scale NCF training: load preprocessed data, build
    NeuMF, train with per-epoch negative sampling, and evaluate HR/NDCG
    until the target threshold is hit or epochs run out.
    """
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    # Check where to put data loader
    if use_cuda:
        dataloader_device = 'cpu' if args.cpu_dataloader else 'cuda'
    else:
        dataloader_device = 'cpu'

    # more like load trigger timmer now
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.valid_negative)
    # The default of np.random.choice is replace=True, so does pytorch
    # random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
                         value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # sync worker before timing.
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    # Load one .npz chunk of test ratings per user-scaling shard.
    print(datetime.now(), "Loading test ratings.")
    test_ratings = [torch.LongTensor()] * args.user_scaling
    for chunk in range(args.user_scaling):
        test_ratings[chunk] = torch.from_numpy(
            np.load(args.data + '/testx' + str(args.user_scaling) + 'x'
                    + str(args.item_scaling) + '_' + str(chunk) + '.npz',
                    encoding='bytes')['arr_0'])

    fn_prefix = args.data + '/' + CACHE_FN.format(args.user_scaling,
                                                  args.item_scaling)
    sampler_cache = fn_prefix + "cached_sampler.pkl"
    print(datetime.now(), "Loading preprocessed sampler.")
    if os.path.exists(args.data):
        print("Using alias file: {}".format(args.data))
        with open(sampler_cache, "rb") as f:
            sampler, pos_users, pos_items, nb_items, _ = pickle.load(f)
    print(datetime.now(), "Alias table loaded.")
    nb_users = len(sampler.num_regions)
    train_users = torch.from_numpy(pos_users).type(torch.LongTensor)
    train_items = torch.from_numpy(pos_items).type(torch.LongTensor)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))

    # produce things not change between epoch
    # mask for filtering duplicates with real sample
    # note: test data is removed before create mask, same as reference
    # create label: 1 for each positive, 0 for each of the repeated negatives.
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label

    test_pos = [l[:, 1].reshape(-1, 1) for l in test_ratings]
    test_negatives = [torch.LongTensor()] * args.user_scaling
    test_neg_items = [torch.LongTensor()] * args.user_scaling
    print(datetime.now(), "Loading test negatives.")
    for chunk in range(args.user_scaling):
        file_name = (args.data + '/test_negx' + str(args.user_scaling) + 'x'
                     + str(args.item_scaling) + '_' + str(chunk) + '.npz')
        raw_data = np.load(file_name, encoding='bytes')
        test_negatives[chunk] = torch.from_numpy(raw_data['arr_0'])
        print(datetime.now(),
              "Test negative chunk {} of {} loaded ({} users).".format(
                  chunk + 1, args.user_scaling, test_negatives[chunk].size()))
    test_neg_items = [l[:, 1] for l in test_negatives]

    # create items with real sample at last position
    test_items = [
        torch.cat((a.reshape(-1, args.valid_negative), b), dim=1)
        for a, b in zip(test_neg_items, test_pos)
    ]
    del test_ratings, test_neg_items

    # generate dup mask and real indice for exact same behavior on
    # duplication compare to reference
    # here we need a sort that is stable(keep order of duplicates)
    # this is a version works on integer
    sorted_items, indices = zip(*[torch.sort(l) for l in test_items])  # [1,1,1,2], [3,1,0,2]
    sum_item_indices = [
        a.float() + b.float() / len(b[0])
        for a, b in zip(sorted_items, indices)
    ]  # [1.75,1.25,1.0,2.5]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]  # [2,1,0,3]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]  # [0,1,3,2]
    # produce -1 mask: adjacent equal entries in the sorted view are dups.
    dup_mask = [(l[:, 0:-1] == l[:, 1:]) for l in sorted_items]
    dup_mask = [
        torch.cat((torch.zeros_like(a, dtype=torch.uint8), b), dim=1)
        for a, b in zip(test_pos, dup_mask)
    ]
    dup_mask = [
        torch.gather(a, 1, b.sort()[1])
        for a, b in zip(dup_mask, stable_indices)
    ]
    # produce real sample indices to later check in topk
    sorted_items, indices = zip(*[(a != b).sort()
                                  for a, b in zip(test_items, test_pos)])
    sum_item_indices = [(a.float()) + (b.float()) / len(b[0])
                        for a, b in zip(sorted_items, indices)]
    indices_order = [torch.sort(l)[1] for l in sum_item_indices]
    stable_indices = [
        torch.gather(a, 1, b) for a, b in zip(indices, indices_order)
    ]
    real_indices = [l[:, 0] for l in stable_indices]
    del sorted_items, indices, sum_item_indices, indices_order, \
        stable_indices, test_pos

    # For our dataset, test set is identical to user set, so arange() provides
    # all test users.
    test_users = torch.arange(nb_users, dtype=torch.long)
    test_users = test_users[:, None]
    # Broadcast each user id across its 1 + valid_negative candidate slots.
    test_users = test_users + torch.zeros(1 + args.valid_negative,
                                          dtype=torch.long)
    # test_items needs to be of type Long in order to be used in embedding
    test_items = torch.cat(test_items).type(torch.long)
    dup_mask = torch.cat(dup_mask)
    real_indices = torch.cat(real_indices)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # we shuffled later with randperm
    print(datetime.now(),
          "Data loading done {:.1f} sec. #user={}, #item={}, #train={}, "
          "#test={}".format(time.time() - run_start_time, nb_users, nb_items,
                            len(train_users), nb_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    params = model.parameters()
    optimizer = torch.optim.Adam(params, lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2),
                                 eps=args.eps)
    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN,
                         value=mlperf_log.BCE)
    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    local_batch = args.batch_size
    # Trace the loss once on dummy inputs so the epoch loop runs the
    # compiled graph.
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    samples_per_user = test_items.size(1)
    users_per_valid_batch = args.valid_batch_size // samples_per_user
    test_users = test_users.split(users_per_valid_batch)
    test_items = test_items.split(users_per_valid_batch)
    dup_mask = dup_mask.split(users_per_valid_batch)
    real_indices = real_indices.split(users_per_valid_batch)

    hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                         real_indices, args.topk,
                         samples_per_user=samples_per_user,
                         num_user=nb_users)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=hr, ndcg=ndcg))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        st = timeit.default_timer()
        # Fresh negatives every epoch: uniform random or alias-table sampler.
        if args.random_negatives:
            neg_users = train_users.repeat(args.negative_samples)
            neg_items = torch.empty_like(neg_users,
                                         dtype=torch.int64).random_(
                                             0, nb_items)
        else:
            negatives = generate_negatives(sampler, args.negative_samples,
                                           train_users.numpy())
            negatives = torch.from_numpy(negatives)
            neg_users = negatives[:, 0]
            neg_items = negatives[:, 1]
        print("generate_negatives loop time: {:.2f}",
              timeit.default_timer() - st)
        after_neg_gen = time.time()

        st = timeit.default_timer()
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))
        del neg_users, neg_items
        # shuffle prepared data and split into batches
        epoch_indices = torch.randperm(len(epoch_users),
                                       device=dataloader_device)
        epoch_size = len(epoch_indices)
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]
        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)
        print("shuffle time: {:.2f}", timeit.default_timer() - st)

        # only print progress bar on rank 0
        num_batches = (epoch_size + args.batch_size - 1) // args.batch_size
        qbar = tqdm.tqdm(range(num_batches))
        # handle extremely rare case where last batch size < number of worker
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of worker!")
            exit(1)

        after_shuffle = time.time()
        neg_gen_time = (after_neg_gen - begin)
        shuffle_time = (after_shuffle - after_neg_gen)

        for i in qbar:
            # selecting input from prepared data
            user = epoch_users_list[i].cuda()
            item = epoch_items_list[i].cuda()
            label = epoch_label_list[i].view(-1, 1).cuda()
            # Zero grads by dropping them (cheaper than optimizer.zero_grad).
            for p in model.parameters():
                p.grad = None
            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)
            loss.backward()
            optimizer.step()
        del epoch_users, epoch_items, epoch_label, epoch_users_list, \
            epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin

        begin = time.time()
        mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=samples_per_user,
                             num_user=nb_users, output=valid_results_file,
                             epoch=epoch, loss=loss.data.item())
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}, loss = {loss:.4f},'
            ' neg_gen: {neg_gen_time:.4f}, shuffle_time: {shuffle_time:.2f}'.
            format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg,
                   train_time=train_time, val_time=val_time,
                   loss=loss.data.item(), neg_gen_time=neg_gen_time,
                   shuffle_time=shuffle_time))
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={
            "epoch": epoch,
            "value": hr
        })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP, value=epoch)
        # Early stop once the target hit-rate threshold is reached.
        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
    # easy way of tracking mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
def main():
    """Train the Multi_Preference_Model (MGPM) with checkpointing and
    per-epoch HR/NDCG evaluation; resumes from a checkpoint when requested.
    """
    # Note: The run start is in data_preprocess.py
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/MGPM/{}/{}".format(
        os.path.basename(os.path.normpath(args.data)), config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print("Using cuda ...")
    else:
        print("Using CPU ...")

    t1 = time.time()
    best_hit, best_ndcg = 0., 0.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Load Data
    print('Loading data')
    print(os.path.join(args.data, TRAIN_RATINGS_FILENAME))
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME),
        os.path.join(args.data, DATA_SUMMARY_FILENAME),
        args.negative_samples)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = Multi_Preference_Model(nb_users=nb_users, nb_items=nb_items,
                                   embed_dim=32, history_size=9)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(), betas=(beta1, beta2),
                                 lr=args.learning_rate, eps=epsilon)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN,
                         value=mlperf_log.BCE)
    # optimizer = torch.optim.SGD(model.parameters(),lr=args.learning_rate,momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/' + model._get_name() + '.pd')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_hit = checkpoint['hit']
        best_ndcg = checkpoint['ndcg']

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG (skip when resuming mid-run).
    if start_epoch == 0:
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda, processes=args.processes)
        print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(start_epoch, args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, history, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            history = torch.autograd.Variable(history, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                history = history.cuda()
                label = label.cuda()
            # outputs, _ = model(user, item,history)
            outputs = model(user, item, history)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)
        train_time = time.time() - begin

        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file, epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={
            "epoch": epoch,
            "value": float(np.mean(hits))
        })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch, K=args.topk, hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs), train_time=train_time,
                val_time=val_time))

        # Checkpoint whenever either metric matches or beats its best so far.
        if np.mean(hits) >= best_hit or np.mean(ndcgs) >= best_ndcg:
            best_hit = np.mean(hits)
            best_ndcg = np.mean(ndcgs)
            # Save checkpoint.
            print('Saving checkpoint..')
            state = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'hit': best_hit,
                'ndcg': best_ndcg,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, './checkpoint/' + model._get_name() + '.pd')

    print("Best hit: ", best_hit)
    print("Best_ndcg: ", best_ndcg)
    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP)
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
def main():
    """Train the reference NeuMF model and evaluate HR/NDCG each epoch,
    stopping early once the hit-rate threshold is reached.
    """
    # Note: The run start is in convert.py
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    # File is located in a subdirectory of the data directory
    run_dir = os.path.join(args.data,
                           "run_neumf_{}".format(config['timestamp']))
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME),
        args.negative_samples)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    # set shuffle=True in DataLoader
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(), betas=(beta1, beta2),
                                 lr=args.learning_rate, eps=epsilon)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN,
                         value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                            use_cuda=use_cuda, processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader, disable=args.progress_bar)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                label = label.cuda()
            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Save stats to file
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)
        train_time = time.time() - begin

        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file, epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={
            "epoch": epoch,
            "value": float(np.mean(hits))
        })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
            ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch, K=args.topk, hit_rate=np.mean(hits),
                ndcg=np.mean(ndcgs), train_time=train_time,
                val_time=val_time))

        # Early stop once the target hit-rate threshold is reached.
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
def main():
    """Preprocess a raw implicit-feedback ratings file into NCF train/test CSVs.

    Pipeline: load ratings, drop users with fewer than MIN_RATINGS
    interactions, remap user/item IDs to dense 0-based indices, hold out each
    user's most recent item as the test positive, draw ``args.negatives``
    negative items per user, and write TRAIN_RATINGS / TEST_RATINGS /
    TEST_NEG files (tab-separated, no header) into ``args.output``.

    NOTE(review): statement order is load-bearing -- the seeded
    ``np.random.choice`` calls consume the RNG stream in user order, and the
    MLPerf RUN_START log must be emitted before the first random operation.
    """
    args = parse_args()
    np.random.seed(args.seed)  # seed NumPy RNG so negative sampling is reproducible

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
    # Keep only users with enough interactions to form train + test examples.
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()
    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}
    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])
    # Sanity check: remapped IDs are dense in [0, n).
    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item -- leave-one-out test split
    # takes each user's chronologically last interaction.
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        # Lists stay time-ordered because df was sorted by timestamp above.
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    print("Generating {} negative samples for each user".format(
        args.negatives))
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.negatives)
    # The default of np.random.choice is replace=True
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)
    #===========================================================================
    #== First random operation triggers the clock start. =======================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)
    for user in tqdm(range(len(original_users)),
                     desc='Users', total=len(original_users)):  # noqa: E501
        # Most recent item becomes the test positive; remove it from train.
        test_item = user_to_items[user].pop()
        all_ratings.remove((user, test_item))
        # Candidate negatives = items the user never interacted with.
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism
        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1  # implicit feedback: every pair counts as rating 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(df_train_ratings))
    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')
    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
def main():
    """Train NeuMF on preprocessed GPU tensors with optional fp16/distributed.

    Loads pre-tensorized train/test ratings from ``args.data``, builds
    per-epoch negative samples on the GPU, trains with (Fused)Adam against a
    BCE-with-logits loss, and evaluates HR@K / NDCG@K each epoch until
    ``args.threshold`` is hit or ``args.epochs`` elapse. Emits MLPerf
    compliance log entries throughout.

    NOTE(review): statement order is load-bearing -- the RUN_START log must
    precede data loading, and the many ``del`` statements manage CUDA memory
    between phases; do not reorder.
    """
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)

    # Save configuration to file for later inspection of this run.
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    # One run dir per rank so distributed workers don't clobber each other.
    run_dir = "./run/neumf/{}.{}".format(config['timestamp'],args.local_rank)
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Preprocessing hyperparameters are logged here (before the timed region).
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True, so does pytorch random_()
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # Sync workers before timing so the clock starts together on all ranks.
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()
    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    # Load raw (user, item) rating tensors; test set was split off upstream.
    train_ratings = torch.load(args.data+'/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data+'/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))

    # Derive dimensions from the data: max ID + 1 in each column.
    # assumes column 0 = user IDs, column 1 = item IDs -- TODO confirm upstream
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item()+1
    nb_items = nb_maxs[1].item()+1
    train_users = train_ratings[:,0]
    train_items = train_ratings[:,1]
    del nb_maxs, train_ratings  # free GPU memory before building the big mask
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))

    # Produce things that do not change between epochs:
    # mask for filtering duplicates with real sample.
    # note: test data is removed before create mask, same as reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0  # 0 marks observed train interactions

    # Create labels: ones for real interactions, zeros for sampled negatives.
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label,neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # Produce validation negative samples on GPU.
    all_test_users = test_ratings.shape[0]
    test_users = test_ratings[:,0]
    test_pos = test_ratings[:,1].reshape(-1,1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]

    # Create item lists with the real (positive) sample at the last position.
    test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # Generate dup mask and real indices for exactly the same behavior on
    # duplicates as the reference implementation.
    # We need a stable sort (keeps order of duplicates); torch.sort is not
    # stable, so encode (value, original-index) into one float and sort that.
    sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
    # produce -1 mask: flag repeated item values within each user's list
    dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
    dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
    # Produce real-sample indices to later check membership in top-k:
    # same stable-sort trick, ordering the positive (==test_pos) first.
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:,0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    # Shard the evaluation data across ranks.
    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    # Make pytorch memory behavior more consistent later.
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # we shuffled later with randperm

    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time()-run_start_time, nb_users, nb_items, len(train_users),
             nb_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    if args.fp16:
        model = model.half()
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph. In fp16 mode the optimizer updates a
    # separate fp32 master copy of the parameters.
    if args.fp16:
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()
    #optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = AdamOpt(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    optimizer = FusedAdam(params, lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps,
                          eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(reduction = 'none') # use torch.mean() with dim later to avoid copy to host

    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.distributed:
        model = DDP(model)
        # Global batch is split evenly across workers.
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    # JIT-trace the loss on a dummy batch to avoid Python overhead per step.
    traced_criterion = torch.jit.trace(criterion.forward,
                                       (torch.rand(local_batch,1),torch.rand(local_batch,1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG (untrained baseline).
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)
    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                         samples_per_user=test_items.size(1),
                         num_user=all_test_users, distributed=args.distributed)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
          .format(K=args.topk, hit_rate=hr, ndcg=ndcg))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()

        # Prepare data for this epoch: fresh negatives appended to positives.
        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
        epoch_users = torch.cat((train_users,neg_users))
        epoch_items = torch.cat((train_items,neg_items))
        del neg_users, neg_items

        # Shuffle prepared data and split into batches.
        epoch_indices = torch.randperm(len(epoch_users), device='cuda:{}'.format(args.local_rank))
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]

        if args.distributed:
            epoch_users = torch.chunk(epoch_users, args.world_size)[args.local_rank]
            epoch_items = torch.chunk(epoch_items, args.world_size)[args.local_rank]
            epoch_label = torch.chunk(epoch_label, args.world_size)[args.local_rank]

        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        # only print progress bar on rank 0
        num_batches = (len(epoch_indices) + args.batch_size - 1) // args.batch_size
        if args.local_rank == 0:
            qbar = tqdm.tqdm(range(num_batches))
        else:
            qbar = range(num_batches)
        # handle extremely rare case where last batch size < number of worker
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of worker!")
            exit(1)

        for i in qbar:
            # selecting input from prepared data
            user = epoch_users_list[i]
            item = epoch_items_list[i]
            label = epoch_label_list[i].view(-1,1)

            # Cheaper than optimizer.zero_grad(): drop grads outright.
            for p in model.parameters():
                p.grad = None
            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)

            if args.fp16:
                # fp16 path: loss-scaled backward + fp32 master-weight update.
                fp_optimizer.step(loss, optimizer)
            else:
                loss.backward()
                optimizer.step()

        # Free per-epoch tensors before evaluation allocates.
        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START)
        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                             samples_per_user=test_items.size(1),
                             num_user=all_test_users, output=valid_results_file,
                             epoch=epoch, distributed=args.distributed)

        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg,
                      train_time=train_time, val_time=val_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)

        # Early stop once the target hit rate is reached.
        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
    # easy way of tracking mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
def main():
    """Preprocess the taobao UserBehavior log into NCF train/test CSVs.

    Reads the raw interaction CSV from ``args.file``, remaps user/item IDs to
    dense 0-based indices, drops users with fewer than MIN_RATINGS events,
    holds out each user's most recent item as the test positive with
    ``args.negatives`` sampled negatives, builds sliding-history training
    rows of length ``args.history_size``, and writes the summary plus
    train/test/negatives files into ``args.output``.

    Fixes vs. previous revision:
      * seed the ``random`` module too -- ``np.random.seed`` alone left
        ``random.sample`` unseeded, so negatives were not reproducible;
      * record ``args.history_size`` (the value actually used) in the data
        summary instead of the module constant HISTORY_SIZE;
      * drop a redundant second ``pd.to_datetime`` call and leftover debug
        prints of ``user_to_items[0]``.
    """
    args = parse_args()
    np.random.seed(args.seed)
    # BUGFIX: random.sample below draws from Python's `random` module, which
    # np.random.seed does not touch; seed it as well for reproducibility.
    random.seed(args.seed)

    print("Loading raw data from {}".format(args.file))
    # NOTE(review): alternate loaders for MovieLens / retailrocket / amazon /
    # hetrec2011 previously lived here as commented-out variants; swap in the
    # matching read_csv + column mapping to preprocess a different source.
    # taobao UserBehavior: headerless CSV with epoch-second timestamps.
    df = pd.read_csv(args.file, sep=',', header=None)
    df.columns = [
        'user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp'
    ]
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

    RatingData = namedtuple(
        'RatingData', ['items', 'users', 'ratings', 'min_date', 'max_date'])
    info = RatingData(items=len(df['item_id'].unique()),
                      users=len(df['user_id'].unique()),
                      ratings=len(df),
                      min_date=df['timestamp'].min(),
                      max_date=df['timestamp'].max())
    print("{ratings} ratings on {items} items from {users} users"
          " from {min_date} to {max_date}".format(**(info._asdict())))

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS,
                         value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()
    nb_users = len(original_users)
    nb_items = len(original_items)
    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}
    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])
    # Sanity check: remapped IDs are dense in [0, n).
    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last (most recent) item.
    # (A second pd.to_datetime call here was redundant -- the column was
    # already converted right after loading -- and has been removed.)
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        # Lists stay time-ordered because df was sorted by timestamp above.
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    print(
        "Generating {} negative samples for each user and creating training set"
        .format(args.negatives))
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL,
                         value=args.negatives)

    train_ratings = []
    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    for key, value in tqdm(user_to_items.items(), total=len(user_to_items)):
        # Candidate negatives = items this user never interacted with.
        all_negs = all_items - set(value)
        all_negs = sorted(list(all_negs))  # determinism
        negs = random.sample(all_negs, args.negatives)
        # Most recent item is the held-out test positive.
        test_item = value.pop()
        tmp = [key, test_item]
        tmp.extend(negs)
        test_negs.append(tmp)
        # Test row: user, positive item, last `history_size` earlier items.
        tmp = [key, test_item]
        tmp.extend(value[-args.history_size:])
        test_ratings.append(tmp)
        # Slide backwards through the remaining history to build train rows.
        while len(value) > args.history_size:
            tgItem = value.pop()
            tmp = [key, tgItem]
            tmp.extend(value[-args.history_size:])
            train_ratings.append(tmp)

    print("\nSaving train and test CSV files to {}".format(args.output))
    df_train_ratings = pd.DataFrame(list(train_ratings))
    df_test_ratings = pd.DataFrame(list(test_ratings))
    df_test_negs = pd.DataFrame(list(test_negs))

    print('Saving data description ...')
    # BUGFIX: record the history size actually used (args.history_size), not
    # the module-level HISTORY_SIZE default, so the summary matches the data.
    data_summary = pd.DataFrame(
        {
            'users': nb_users,
            'items': nb_items,
            'history_size': args.history_size,
            'train_entries': len(df_train_ratings),
            'test': len(df_test_ratings)
        }, index=[0])
    data_summary.to_csv(os.path.join(args.output, DATA_SUMMARY_FILENAME),
                        header=True, index=False, sep=',')

    df_train_ratings['fake_rating'] = 1  # implicit feedback: constant rating
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(df_train_ratings))
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')