# BPR training on a generic dataset loaded via dataloader.load_dataset().
# NOTE(review): near-duplicate of the citeulike script below; keep in sync.
from openrec import ModelTrainer          # was missing in this fragment; used below
from openrec.utils import Dataset         # was missing in this fragment; used below
from BPR import BPR
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

# raw_data = dataloader.load_citeulike()
raw_data = dataloader.load_dataset()

# Embedding dimensionality. The original file contained the unresolved
# placeholder CHANGE_DIM_HERE (a NameError at runtime); 100 matches the
# companion citeulike script in this file.
dim_embed = 100

total_iter = raw_data["max_iteration"]
batch_size = 1000
eval_iter = total_iter   # evaluate (and save) once, at the end of training
save_iter = eval_iter

train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'],
                        raw_data['total_items'], name='Train')
# 500 sampled negatives per user keep evaluation tractable.
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'],
                      raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'],
                       raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset,
                                      num_process=5)
from openrec import ModelTrainer
from openrec.utils import Dataset
from BPR import BPR
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

# BPR (Bayesian Personalized Ranking) training on the citeulike dataset.
raw_data = dataloader.load_citeulike()

dim_embed = 100
total_iter = 10000
batch_size = 1000
eval_iter = 10000
save_iter = eval_iter

train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'],
                        raw_data['total_items'], name='Train')
# 500 sampled negatives per user keep evaluation tractable.
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'],
                      raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'],
                       raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                l2_reg=0.01,
                dim_user_embed=dim_embed,
                dim_item_embed=dim_embed,
                save_model_dir='bpr_recommender/',
                train=True, serve=True)

model_trainer = ModelTrainer(model=bpr_model)
auc_evaluator = AUC()

# The original statement was truncated after "train_sampler=train_sampler,";
# completed with the samplers/evaluator built above (same call shape as the
# complete ModelTrainer.train call in sample_data_and_train in this file).
model_trainer.train(total_iter=total_iter,
                    eval_iter=eval_iter,
                    save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler, test_sampler],
                    evaluators=[auc_evaluator])
import numpy as np  # np.load is used below but numpy was never imported in this fragment

from openrec.utils.evaluators import MSE
from openrec.utils.samplers import ExplicitSampler

# ItrMLP rating-prediction experiment on the Netflix dataset, warm-started
# from pre-trained user/item embeddings.
# NOTE(review): Dataset, ItrMLP and ItrMLPModelTrainer are not imported in
# this fragment -- they must be brought into scope elsewhere in the file.
batch_size = 32
test_batch_size = 32
display_itr = 4096
update_itr = 4096
max_user = 480189
max_item = 17770

pretrained_user_embeddings = np.load('dataset/netflix/pretrained_user_embeddings.npy')
pretrained_item_embeddings = np.load('dataset/netflix/pretrained_item_embeddings.npy')
netflix_ratings = np.load('dataset/netflix/netflix_ratings_formatted.npy')

# Hold out the last 10M ratings: 5M for validation, 5M for testing (the
# ExplicitSampler below is chronological, so the array is presumably
# time-ordered -- TODO confirm).
train_dataset = Dataset(netflix_ratings[:-int(1e7)], max_user=max_user,
                        max_item=max_item, name='Train')
val_dataset = Dataset(netflix_ratings[-int(1e7):-int(5e6)], max_user=max_user,
                      max_item=max_item, name='Val')
test_dataset = Dataset(netflix_ratings[-int(5e6):], max_user=max_user,
                       max_item=max_item, name='Test')

model = ItrMLP(batch_size=batch_size, max_user=max_user, max_item=max_item,
               dim_embed=20, opt='SGD',
               pretrained_user_embeddings=pretrained_user_embeddings,
               pretrained_item_embeddings=pretrained_item_embeddings,
               user_dims=[30, 30, 20], item_dims=[30, 30, 20],
               test_batch_size=test_batch_size)

sampler = ExplicitSampler(batch_size=batch_size, dataset=train_dataset,
                          chronological=True)
model_trainer = ItrMLPModelTrainer(batch_size=batch_size,
                                   test_batch_size=test_batch_size,
                                   train_dataset=train_dataset,
                                   model=model, sampler=sampler)
mse_evaluator = MSE()

# The original statement was truncated after "eval_datasets=[...],";
# completed with the MSE evaluator constructed above (its only plausible use).
model_trainer.train(num_itr=int(1e5),
                    display_itr=display_itr,
                    update_itr=update_itr,
                    eval_datasets=[val_dataset, test_dataset],
                    evaluators=[mse_evaluator])
def sample_data_and_train(self):
    """Parse the interaction file, hold out two items per user, and train BPR.

    Reads ``self.path_to_dataset`` (a path fragment appended to this module's
    directory); each line is expected to be ``<count> <item> <item> ...`` for
    one user.  Per user, two randomly chosen items are held out -- one for
    validation, one for testing -- and the rest become training data.  The
    trained model is saved under ``bpr_recommender/``.
    """
    self.logger.warning(
        'sample_data_and_train called, pid = %d Please kill process on unsuccessful training',
        os.getpid())
    self.logger.info('-------- sample_data_and_train starts --------')

    # First pass: count users (lines) and interactions (the leading count on
    # each line) so the structured arrays below can be pre-allocated exactly.
    total_users = 0
    interactions_count = 0
    with open(
            os.path.dirname(os.path.abspath(__file__)) +
            self.path_to_dataset, 'r') as fin:
        for line in fin:
            interactions_count += int(line.split()[0])
            total_users += 1
    self.logger.info('############ collecting data.. ############')

    # radomly hold out an item per user for validation and testing respectively.
    interaction_dtype = [('user_id', np.int32), ('item_id', np.int32)]
    val_structured_arr = np.zeros(total_users, dtype=interaction_dtype)
    test_structured_arr = np.zeros(total_users, dtype=interaction_dtype)
    # NOTE(review): assumes every user has >= 2 interactions; a user with
    # fewer makes this size wrong and breaks the fill loop below -- confirm
    # the dataset guarantees this.
    train_structured_arr = np.zeros(interactions_count - total_users * 2,
                                    dtype=interaction_dtype)

    interaction_ind = 0
    next_user_id = 0
    next_item_id = 0
    map_to_item_id = dict()  # Map item id from 0 to len(items)-1

    # Second pass: shuffle each user's item list, then route the first item
    # to validation, the second to testing, and the remainder to training.
    with open(
            os.path.dirname(os.path.abspath(__file__)) +
            self.path_to_dataset, 'r') as fin:
        for line in fin:
            item_list = line.split()[1:]
            random.shuffle(item_list)
            for ind, item in enumerate(item_list):
                if item not in map_to_item_id:
                    # Re-map raw item tokens to dense ids 0..len(items)-1.
                    map_to_item_id[item] = next_item_id
                    next_item_id += 1
                if ind == 0:
                    val_structured_arr[next_user_id] = (
                        next_user_id, map_to_item_id[item])
                elif ind == 1:
                    test_structured_arr[next_user_id] = (
                        next_user_id, map_to_item_id[item])
                else:
                    train_structured_arr[interaction_ind] = (
                        next_user_id, map_to_item_id[item])
                    interaction_ind += 1
            next_user_id += 1

    self.logger.info('############ instantiating dataset.. ############')
    from openrec.utils import Dataset
    train_dataset = Dataset(raw_data=train_structured_arr,
                            total_users=total_users,
                            total_items=len(map_to_item_id),
                            name='Train')
    val_dataset = Dataset(raw_data=val_structured_arr,
                          total_users=total_users,
                          total_items=len(map_to_item_id),
                          num_negatives=500,
                          name='Val')
    test_dataset = Dataset(raw_data=test_structured_arr,
                           total_users=total_users,
                           total_items=len(map_to_item_id),
                           num_negatives=500,
                           name='Test')

    self.logger.info("############ instantiating Samplers.. ############")
    from openrec.utils.samplers import RandomPairwiseSampler
    from openrec.utils.samplers import EvaluationSampler
    train_sampler = RandomPairwiseSampler(batch_size=1000,
                                          dataset=train_dataset,
                                          num_process=5)
    val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
    test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

    self.logger.info(
        "############ instantiating Recommender.. ############")
    from openrec.recommenders import BPR
    bpr_model = BPR(batch_size=1000,
                    total_users=train_dataset.total_users(),
                    total_items=train_dataset.total_items(),
                    dim_user_embed=50,
                    dim_item_embed=50,
                    save_model_dir='bpr_recommender/',
                    train=True,
                    serve=True)

    self.logger.info("############ instantiating Evaluator.. ############")
    from openrec.utils.evaluators import AUC
    auc_evaluator = AUC()

    self.logger.info(
        "############ instantiating Model trainer.. ############")
    from openrec import ModelTrainer
    model_trainer = ModelTrainer(model=bpr_model)

    # Was a bare print(); use the instance logger like every other status
    # message in this method.
    self.logger.info("############ starting training.. ############")
    model_trainer.train(
        total_iter=10000,  # Total number of training iterations
        eval_iter=1000,  # Evaluate the model every "eval_iter" iterations
        save_iter=10000,  # Save the model every "save_iter" iterations
        train_sampler=train_sampler,
        eval_samplers=[val_sampler, test_sampler],
        evaluators=[auc_evaluator])

    self.logger.info("-------- sample_data_and_train ends --------")
def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log, eval_rank):
    """Run one PMF validation experiment for the given sampling configuration.

    Args:
        dataset: 'spotify' or 'bytedance'; anything else prints a message and
            returns early.
        l2_reg: L2 regularization weight passed to the PMF model.
        pos_ratio: fraction of positive feedback per training batch.
        neg_ratio: fraction of explicit negative feedback per batch, or None
            to use stratified pointwise sampling instead.
        eval_explicit: evaluate against explicit negatives (no sampled ones).
        save_log: redirect stdout to <log_dir>/validation.log.
        eval_rank: evaluate click-complete and click-skip test items separately.

    Relies on module-level configuration (batch_size, dim_user_embed,
    dim_item_embed, total_iter, eval_iter, save_iter) and on the loaders
    loadSpotify/loadByteDance defined elsewhere in this file.  Returns None.
    """
    # Sampling ratios must form a usable split when explicit negatives are used.
    if neg_ratio is not None:
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print("Invalid sampling ratios...")
            return

    if dataset == 'spotify':
        data = loadSpotify()
    elif dataset == 'bytedance':
        data = loadByteDance()
    else:
        print("Unsupported dataset...")
        return

    # save logging and model
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(
        dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    # Was os.popen("mkdir -p %s" % log_dir).read(): shelling out with a
    # string-built command is non-portable and injection-prone; os.makedirs
    # is the direct stdlib equivalent (exist_ok matches mkdir -p).
    os.makedirs(log_dir, exist_ok=True)
    if save_log:
        # NOTE(review): stdout stays redirected for the rest of the process
        # and the file is never closed/restored -- confirm this is intended.
        log = open(log_dir + "validation.log", "w")
        sys.stdout = log

    # prepare train, val, test sets and samplers
    train_dataset = Dataset(data['train'], data['total_users'],
                            data['total_items'], name='Train')
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size,
                                                   dataset=train_dataset,
                                                   pos_ratio=pos_ratio,
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size,
                                                 dataset=train_dataset,
                                                 pos_ratio=pos_ratio,
                                                 neg_ratio=neg_ratio,
                                                 num_process=5)
        if neg_ratio > 0.0:
            print("Re-weighting implicit negative feedback")
        else:
            print("Corrected negative feedback labels but not re-weighting")

    eval_num_neg = None if eval_explicit else 500  # num of negative samples for evaluation
    if eval_rank:
        # show evaluation metrics for click-complete and click-skip items separately
        pos_dataset = Dataset(data['pos_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size, dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size, dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'], data['total_users'],
                               data['total_items'],
                               implicit_negative=not eval_explicit,
                               name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]

    # set evaluators
    auc_evaluator = AUC()
    evaluators = [auc_evaluator]

    # set model parameters
    model = PMF(l2_reg=l2_reg,
                batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                dim_user_embed=dim_user_embed,
                dim_item_embed=dim_item_embed,
                save_model_dir=log_dir,
                train=True,
                serve=True)

    # set model trainer and run the experiment
    model_trainer = ModelTrainer(model=model)
    model_trainer.train(total_iter=total_iter,
                        eval_iter=eval_iter,
                        save_iter=save_iter,
                        train_sampler=train_sampler,
                        eval_samplers=eval_samplers,
                        evaluators=evaluators)
# --- Last.fm sequential-recommendation setup (sequence length 20) ---

# Corpus statistics.
total_users = 992
total_items = 14598

# Model / training hyper-parameters.
dim_item_embed = 50
max_seq_len = 20
batch_size = 100
total_iter = int(1e5)
eval_iter = 100
save_iter = eval_iter

# Pre-split interaction logs.
train_data = np.load('dataset/lastfm/lastfm_train.npy')
test_data = np.load('dataset/lastfm/lastfm_test.npy')

# Datasets are ordered by the 'ts' field (presumably timestamps) so the
# temporal samplers can cut consecutive windows.
train_dataset = Dataset(train_data, total_users, total_items,
                        sortby='ts', name='Train')
test_dataset = Dataset(test_data, total_users, total_items,
                       sortby='ts', name='Test')

train_sampler = TemporalSampler(batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                dataset=train_dataset,
                                num_process=1)
test_sampler = TemporalEvaluationSampler(dataset=test_dataset,
                                         max_seq_len=max_seq_len)
# --- Last.fm sequential-recommendation setup (sequence length 100) ---

# Pre-split interaction logs.
lastfm_train = np.load('dataset/lastfm/lastfm_train.npy')
lastfm_test = np.load('dataset/lastfm/lastfm_test.npy')

# Corpus statistics.
total_users = 992
total_items = 14598

# Model / training hyper-parameters.
dim_item_embed = 50
max_seq_len = 100
num_units = 32
batch_size = 256
total_iter = int(1e5)
eval_iter = 100
save_iter = eval_iter

# Datasets are ordered by the 'ts' field (presumably timestamps) so the
# temporal samplers can cut consecutive windows.
train_dataset = Dataset(raw_data=lastfm_train,
                        total_users=total_users,
                        total_items=total_items,
                        sortby='ts', name='Train')
test_dataset = Dataset(raw_data=lastfm_test,
                       total_users=total_users,
                       total_items=total_items,
                       sortby='ts', name='Test')

train_sampler = TemporalSampler(batch_size=batch_size,
                                max_seq_len=max_seq_len,
                                dataset=train_dataset,
                                num_process=1)
test_sampler = TemporalEvaluationSampler(dataset=test_dataset,
                                         max_seq_len=max_seq_len)