def __init__(self, train_fname, nb_neg):
    self._load_train_matrix(train_fname)
    self.nb_neg = nb_neg
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN, value=nb_neg)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
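# The constructor above is only part of the Dataset. A minimal sketch of the
# remaining Dataset protocol (__len__/__getitem__) that pairs each positive
# with nb_neg sampled negatives, assuming _load_train_matrix() fills self.data
# with (user, item) pairs and self.mat with a dok_matrix of known positives.
# Attribute names here are illustrative assumptions, not the repo's verbatim code.
def __len__(self):
    # each positive example is followed by nb_neg sampled negatives
    return (self.nb_neg + 1) * len(self.data)

def __getitem__(self, idx):
    if idx % (self.nb_neg + 1) == 0:
        # positive example: label 1
        idx = idx // (self.nb_neg + 1)
        return self.data[idx][0], self.data[idx][1], np.ones(1, dtype=np.float32)
    # negative example: resample (with replacement) until the item is not a known positive
    idx = idx // (self.nb_neg + 1)
    u = self.data[idx][0]
    j = np.random.randint(self.nb_items)
    while (u, j) in self.mat:
        j = np.random.randint(self.nb_items)
    return u, j, np.zeros(1, dtype=np.float32)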
def val_epoch(model, ratings, negs, K, use_cuda=True, output=None, epoch=None,
              processes=1):
    if epoch is None:
        print("Initial evaluation")
    else:
        print("Epoch {} evaluation".format(epoch))
    mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch)

    start = datetime.now()
    model.eval()
    if processes > 1:
        # Evaluate users in parallel worker processes
        context = mp.get_context('spawn')
        _eval_one = partial(eval_one, model=model, K=K, use_cuda=use_cuda)
        with context.Pool(processes=processes) as workers:
            hits_ndcg_numpred = workers.starmap(_eval_one, zip(ratings, negs))
        hits, ndcgs, num_preds = zip(*hits_ndcg_numpred)
    else:
        # Evaluate users sequentially in this process
        hits, ndcgs, num_preds = [], [], []
        for rating, items in zip(ratings, negs):
            hit, ndcg, num_pred = eval_one(rating, items, model, K,
                                           use_cuda=use_cuda)
            hits.append(hit)
            ndcgs.append(ndcg)
            num_preds.append(num_pred)

    hits = np.array(hits, dtype=np.float32)
    ndcgs = np.array(ndcgs, dtype=np.float32)

    assert len(set(num_preds)) == 1
    num_neg = num_preds[0] - 1  # one true positive, many negatives
    mlperf_log.ncf_print(key=mlperf_log.EVAL_SIZE,
                         value={"epoch": epoch, "value": len(hits) * (1 + num_neg)})
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_USERS, value=len(hits))
    mlperf_log.ncf_print(key=mlperf_log.EVAL_HP_NUM_NEG, value=num_neg)

    end = datetime.now()
    if output is not None:
        result = OrderedDict()
        result['timestamp'] = datetime.now()
        result['duration'] = end - start
        result['epoch'] = epoch
        result['K'] = K
        result['hit_rate'] = np.mean(hits)
        result['NDCG'] = np.mean(ndcgs)
        utils.save_result(result, output)

    return hits, ndcgs
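# val_epoch() delegates per-user scoring to eval_one(). A minimal sketch of
# what such a helper could look like: score the held-out item together with
# its sampled negatives, take the top-K, and compute hit-rate and NDCG for the
# single true item. This is an illustrative assumption about the helper's
# shape, not the repo's verbatim implementation.
def eval_one(rating, items, model, K, use_cuda=True):
    user, true_item = rating[0], rating[1]
    items = list(items) + [true_item]        # negatives plus the one positive
    users = torch.LongTensor([user] * len(items))
    items_t = torch.LongTensor(items)
    if use_cuda:
        users, items_t = users.cuda(), items_t.cuda()
    with torch.no_grad():
        scores = model(users, items_t).squeeze()
    _, indices = torch.topk(scores, K)
    recommends = torch.take(items_t, indices).cpu().numpy().tolist()
    hit = float(true_item in recommends)
    if hit:
        rank = recommends.index(true_item)   # 0-based rank of the true item
        ndcg = 1.0 / np.log2(rank + 2)       # DCG of a single relevant item
    else:
        ndcg = 0.
    return hit, ndcg, len(items)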
def __init__(self, nb_users, nb_items,
             mf_dim, mf_reg,
             mlp_layer_sizes, mlp_layer_regs):
    if len(mlp_layer_sizes) != len(mlp_layer_regs):
        raise RuntimeError('u dummy, layer_sizes != layer_regs!')
    if mlp_layer_sizes[0] % 2 != 0:
        raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
    super(NeuMF, self).__init__()
    nb_mlp_layers = len(mlp_layer_sizes)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MF_DIM)
    # TODO: regularization?
    self.mf_user_embed = nn.Embedding(nb_users, mf_dim)
    self.mf_item_embed = nn.Embedding(nb_items, mf_dim)
    self.mlp_user_embed = nn.Embedding(nb_users, mlp_layer_sizes[0] // 2)
    self.mlp_item_embed = nn.Embedding(nb_items, mlp_layer_sizes[0] // 2)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_MLP_LAYER_SIZES,
                         value=mlp_layer_sizes)
    self.mlp = nn.ModuleList()
    for i in range(1, nb_mlp_layers):
        self.mlp.extend([nn.Linear(mlp_layer_sizes[i - 1], mlp_layer_sizes[i])])  # noqa: E501

    self.final = nn.Linear(mlp_layer_sizes[-1] + mf_dim, 1)

    self.mf_user_embed.weight.data.normal_(0., 0.01)
    self.mf_item_embed.weight.data.normal_(0., 0.01)
    self.mlp_user_embed.weight.data.normal_(0., 0.01)
    self.mlp_item_embed.weight.data.normal_(0., 0.01)

    def golorot_uniform(layer):
        fan_in, fan_out = layer.in_features, layer.out_features
        limit = np.sqrt(6. / (fan_in + fan_out))
        layer.weight.data.uniform_(-limit, limit)

    def lecunn_uniform(layer):
        fan_in, fan_out = layer.in_features, layer.out_features  # noqa: F841, E501
        limit = np.sqrt(3. / fan_in)
        layer.weight.data.uniform_(-limit, limit)

    for layer in self.mlp:
        if type(layer) != nn.Linear:
            continue
        golorot_uniform(layer)
    lecunn_uniform(self.final)
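# The constructor above only builds the layers. A minimal sketch of the
# corresponding forward pass, assuming the standard NeuMF wiring: the GMF
# branch takes an element-wise product of its embeddings, the MLP branch
# concatenates its embeddings and runs them through the Linear stack with
# ReLU activations, and both branches are fused before the final projection.
# Illustrative sketch, not necessarily the repo's exact code.
def forward(self, user, item, sigmoid=False):
    # GMF branch: element-wise product of user/item latent factors
    xmf = self.mf_user_embed(user) * self.mf_item_embed(item)

    # MLP branch: concatenate embeddings, then pass through the tower
    xmlp = torch.cat((self.mlp_user_embed(user), self.mlp_item_embed(item)), dim=1)
    for layer in self.mlp:
        xmlp = nn.functional.relu(layer(xmlp))

    # Fuse both branches and project to a single logit
    x = self.final(torch.cat((xmf, xmlp), dim=1))
    if sigmoid:
        x = torch.sigmoid(x)
    return x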
def main():
    # Note: The run start is in convert.py
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
             len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LEARN_RATE, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(), betas=(beta1, beta2),
                                 lr=args.learning_rate, eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                            use_cuda=use_cuda, processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                # non_blocking replaces the deprecated `async` keyword argument,
                # which is a syntax error on Python 3.7+
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'
                           .format(epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda, output=valid_results_file,
                                epoch=epoch, processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={"epoch": epoch, "value": float(np.mean(hits))})
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=np.mean(hits),
                      ndcg=np.mean(ndcgs), train_time=train_time,
                      val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
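# The training loop above tracks the loss with utils.AverageMeter(). A minimal
# sketch of such a running-average helper, assuming the .val/.avg/.update()
# interface used above; the actual utils module may differ.
class AverageMeter(object):
    """Keeps the most recent value and a running average."""
    def __init__(self):
        self.val = 0.
        self.sum = 0.
        self.count = 0
        self.avg = 0.

    def update(self, val, n=1):
        # val is the per-batch average loss; weight it by the batch size n
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count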
def main():
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))

    print("Generating {} negative samples for each user".format(args.negatives))
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.negatives)
    # The default of np.random.choice is replace=True
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
                         value=True)

    # ==========================================================================
    # == First random operation triggers the clock start. =====================
    # ==========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism

        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(df_train_ratings))

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
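# The training script reads these files back through load_test_ratings() and
# load_test_negs(). A minimal sketch of readers matching the tab-separated
# format written above (user, item, fake_rating per row for ratings; one row of
# negative item IDs per user for negatives). These are illustrative assumptions
# about the loaders' shape, not the repo's exact implementations.
def load_test_ratings(fname):
    ratings = pd.read_csv(fname, sep='\t', header=None, usecols=[0, 1])
    return list(zip(ratings[0], ratings[1]))


def load_test_negs(fname):
    negs = pd.read_csv(fname, sep='\t', header=None)
    return negs.values.tolist()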