from openrec import ModelTrainer
from openrec.recommenders import BPR
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

raw_data = dataloader.load_citeulike()
dim_embed = 100
total_iter = 10000
batch_size = 1000
eval_iter = 10000
save_iter = eval_iter

# Training set uses implicit positives; val/test sets sample 500 negatives per user.
train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'],
                        raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'],
                      raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'],
                       raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                l2_reg=0.01,
                dim_user_embed=dim_embed,
                dim_item_embed=dim_embed,
                save_model_dir='bpr_recommender/',
                train=True, serve=True)

model_trainer = ModelTrainer(model=bpr_model)
auc_evaluator = AUC()

model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler, eval_samplers=[val_sampler],
                    evaluators=[auc_evaluator])
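# For reference, a minimal NumPy sketch (independent of OpenRec's internals) of the
# pairwise objective the BPR recommender above optimizes: for each (user, positive
# item, negative item) triple produced by RandomPairwiseSampler, BPR minimizes the
# negative log-sigmoid of the score difference plus an L2 penalty. All names below
# are illustrative, not OpenRec APIs.
import numpy as np

def bpr_loss(user_vecs, pos_item_vecs, neg_item_vecs, l2_reg=0.01):
    # Scores are inner products between user and item embeddings.
    pos_scores = np.sum(user_vecs * pos_item_vecs, axis=1)
    neg_scores = np.sum(user_vecs * neg_item_vecs, axis=1)
    # -log sigmoid(pos - neg), averaged over the batch (logaddexp is numerically stable).
    ranking_loss = np.mean(np.logaddexp(0.0, -(pos_scores - neg_scores)))
    reg = l2_reg * (np.mean(np.sum(user_vecs ** 2, axis=1))
                    + np.mean(np.sum(pos_item_vecs ** 2, axis=1))
                    + np.mean(np.sum(neg_item_vecs ** 2, axis=1)))
    return ranking_loss + reg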
from openrec import ModelTrainer
from openrec.recommenders import PMF
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC, Recall
from openrec.utils.samplers import StratifiedPointwiseSampler, EvaluationSampler

# raw_data, batch_size, dim_embed, total_iter, eval_iter, save_iter, train_dataset
# and val_dataset are assumed to be set up as in the BPR example above.
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'],
                       raw_data['total_items'], name='Test', num_negatives=500)

# Pointwise sampler: each training batch contains 20% positive examples.
train_sampler = StratifiedPointwiseSampler(pos_ratio=0.2, batch_size=batch_size,
                                           dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

model = PMF(batch_size=batch_size,
            total_users=train_dataset.total_users(),
            total_items=train_dataset.total_items(),
            dim_user_embed=dim_embed,
            dim_item_embed=dim_embed,
            save_model_dir='pmf_recommender/',
            train=True, serve=True)

auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

model_trainer = ModelTrainer(model=model)
model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler, test_sampler],
                    evaluators=[auc_evaluator, recall_evaluator])
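# The Recall evaluator above reports Recall@K over each user's ranked candidate
# list. A minimal NumPy sketch of the metric itself (illustrative only, not
# OpenRec's implementation):
import numpy as np

def recall_at_k(scores, pos_mask, k):
    # scores: (num_candidates,) predicted scores for one user's candidate items
    # pos_mask: boolean array marking which candidates are held-out positives
    top_k = np.argsort(-scores)[:k]          # indices of the top-K candidates
    hits = np.sum(pos_mask[top_k])           # positives retrieved in the top K
    return hits / max(np.sum(pos_mask), 1)   # fraction of positives recovered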
from openrec import ModelTrainer
from openrec.recommenders import RNNRec
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC, Recall
from openrec.utils.samplers import TemporalSampler, TemporalEvaluationSampler

# train_dataset, test_data, total_users, total_items and the hyperparameters
# (batch_size, max_seq_len, dim_item_embed, num_units, total_iter, eval_iter,
# save_iter) are assumed to be prepared by the data-loading code not shown here.
test_dataset = Dataset(test_data, total_users=total_users,
                       total_items=total_items, sortby='ts', name='Test')

# Samplers that preserve the temporal order of each user's interactions.
train_sampler = TemporalSampler(batch_size=batch_size, max_seq_len=max_seq_len,
                                dataset=train_dataset, num_process=1)
test_sampler = TemporalEvaluationSampler(dataset=test_dataset,
                                         max_seq_len=max_seq_len)

rnn_model = RNNRec(batch_size=batch_size,
                   dim_item_embed=dim_item_embed,
                   max_seq_len=max_seq_len,
                   total_items=train_dataset.total_items(),
                   num_units=num_units,
                   save_model_dir='rnn_recommender/',
                   train=True, serve=True)

model_trainer = ModelTrainer(model=rnn_model)
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[100, 500])

model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler, eval_samplers=[test_sampler],
                    evaluators=[auc_evaluator, recall_evaluator])
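# The temporal samplers above feed the RNN fixed-length item sequences ordered by
# timestamp. A rough sketch of that kind of preprocessing (illustrative; not
# OpenRec's sampler code): sort a user's interactions by 'ts', keep the most
# recent max_seq_len items, and left-pad shorter histories.
import numpy as np

def build_sequence(item_ids, timestamps, max_seq_len, pad_id=0):
    order = np.argsort(timestamps)                   # oldest to newest
    seq = np.asarray(item_ids)[order][-max_seq_len:]
    padded = np.full(max_seq_len, pad_id, dtype=np.int64)
    padded[max_seq_len - len(seq):] = seq            # left-pad with pad_id
    return padded, len(seq)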
def sample_data_and_train(self):
    self.logger.warning(
        'sample_data_and_train called, pid = %d. '
        'Kill this process if training does not succeed.', os.getpid())
    self.logger.info('-------- sample_data_and_train starts --------')

    # First pass: count users and total interactions in the dataset file.
    total_users = 0
    interactions_count = 0
    with open(os.path.dirname(os.path.abspath(__file__)) + self.path_to_dataset,
              'r') as fin:
        for line in fin:
            interactions_count += int(line.split()[0])
            total_users += 1
    self.logger.info('############ collecting data.. ############')

    # Randomly hold out one item per user for validation and testing respectively.
    val_structured_arr = np.zeros(total_users,
                                  dtype=[('user_id', np.int32),
                                         ('item_id', np.int32)])
    test_structured_arr = np.zeros(total_users,
                                   dtype=[('user_id', np.int32),
                                          ('item_id', np.int32)])
    train_structured_arr = np.zeros(interactions_count - total_users * 2,
                                    dtype=[('user_id', np.int32),
                                           ('item_id', np.int32)])

    interaction_ind = 0
    next_user_id = 0
    next_item_id = 0
    map_to_item_id = dict()  # Map item ids to the range 0..len(items)-1.

    # Second pass: shuffle each user's items; the first goes to validation,
    # the second to testing, and the rest to training.
    with open(os.path.dirname(os.path.abspath(__file__)) + self.path_to_dataset,
              'r') as fin:
        for line in fin:
            item_list = line.split()[1:]
            random.shuffle(item_list)
            for ind, item in enumerate(item_list):
                if item not in map_to_item_id:
                    map_to_item_id[item] = next_item_id
                    next_item_id += 1
                if ind == 0:
                    val_structured_arr[next_user_id] = (next_user_id,
                                                        map_to_item_id[item])
                elif ind == 1:
                    test_structured_arr[next_user_id] = (next_user_id,
                                                         map_to_item_id[item])
                else:
                    train_structured_arr[interaction_ind] = (next_user_id,
                                                             map_to_item_id[item])
                    interaction_ind += 1
            next_user_id += 1

    self.logger.info('############ instantiating dataset.. ############')
    from openrec.utils import Dataset
    train_dataset = Dataset(raw_data=train_structured_arr,
                            total_users=total_users,
                            total_items=len(map_to_item_id),
                            name='Train')
    val_dataset = Dataset(raw_data=val_structured_arr,
                          total_users=total_users,
                          total_items=len(map_to_item_id),
                          num_negatives=500,
                          name='Val')
    test_dataset = Dataset(raw_data=test_structured_arr,
                           total_users=total_users,
                           total_items=len(map_to_item_id),
                           num_negatives=500,
                           name='Test')

    self.logger.info('############ instantiating Samplers.. ############')
    from openrec.utils.samplers import RandomPairwiseSampler
    from openrec.utils.samplers import EvaluationSampler
    train_sampler = RandomPairwiseSampler(batch_size=1000,
                                          dataset=train_dataset,
                                          num_process=5)
    val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
    test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

    self.logger.info('############ instantiating Recommender.. ############')
    from openrec.recommenders import BPR
    bpr_model = BPR(batch_size=1000,
                    total_users=train_dataset.total_users(),
                    total_items=train_dataset.total_items(),
                    dim_user_embed=50,
                    dim_item_embed=50,
                    save_model_dir='bpr_recommender/',
                    train=True, serve=True)

    self.logger.info('############ instantiating Evaluator.. ############')
    from openrec.utils.evaluators import AUC
    auc_evaluator = AUC()

    self.logger.info('############ instantiating Model trainer.. ############')
    from openrec import ModelTrainer
    model_trainer = ModelTrainer(model=bpr_model)

    print('############ starting training.. ############')
    model_trainer.train(
        total_iter=10000,  # Total number of training iterations
        eval_iter=1000,    # Evaluate the model every "eval_iter" iterations
        save_iter=10000,   # Save the model every "save_iter" iterations
        train_sampler=train_sampler,
        eval_samplers=[val_sampler, test_sampler],
        evaluators=[auc_evaluator])
    # self.logger.info("THIS IS WHEN MODEL WILL START TRAINING... returning")
    self.logger.info('-------- sample_data_and_train ends --------')
# Assumes module-level imports (os, sys, Dataset, StratifiedPointwiseSampler,
# EvaluationSampler, AUC, PMF, ModelTrainer), the project-specific loadSpotify /
# loadByteDance loaders and NegativePointwiseSampler, and the globals batch_size,
# dim_user_embed, dim_item_embed, total_iter, eval_iter and save_iter.
def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log, eval_rank):

    if neg_ratio is not None:
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print("Invalid sampling ratios...")
            return

    if dataset == 'spotify':
        data = loadSpotify()
    elif dataset == 'bytedance':
        data = loadByteDance()
    else:
        print("Unsupported dataset...")
        return

    # save logging and model
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(
        dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    os.popen("mkdir -p %s" % log_dir).read()
    if save_log:
        log = open(log_dir + "validation.log", "w")
        sys.stdout = log

    # prepare train, val, test sets and samplers
    train_dataset = Dataset(data['train'], data['total_users'],
                            data['total_items'], name='Train')
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size,
                                                   dataset=train_dataset,
                                                   pos_ratio=pos_ratio,
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size,
                                                 dataset=train_dataset,
                                                 pos_ratio=pos_ratio,
                                                 neg_ratio=neg_ratio,
                                                 num_process=5)
        if neg_ratio > 0.0:
            print("Re-weighting implicit negative feedback")
        else:
            print("Corrected negative feedback labels but not re-weighting")

    eval_num_neg = None if eval_explicit else 500  # number of negative samples for evaluation
    if eval_rank:
        # show evaluation metrics for click-complete and click-skip items separately
        pos_dataset = Dataset(data['pos_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size, dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size, dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'], data['total_users'],
                               data['total_items'],
                               implicit_negative=not eval_explicit,
                               name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]

    # set evaluators
    auc_evaluator = AUC()
    evaluators = [auc_evaluator]

    # set model parameters
    model = PMF(l2_reg=l2_reg,
                batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                dim_user_embed=dim_user_embed,
                dim_item_embed=dim_item_embed,
                save_model_dir=log_dir,
                train=True, serve=True)

    # set model trainer
    model_trainer = ModelTrainer(model=model)
    model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                        train_sampler=train_sampler, eval_samplers=eval_samplers,
                        evaluators=evaluators)
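# AUC, the evaluator used above, measures how often the model scores a held-out
# positive above a sampled negative for the same user. A per-user NumPy sketch of
# the metric (illustrative; not OpenRec's evaluator):
import numpy as np

def user_auc(pos_scores, neg_scores):
    # Fraction of (positive, negative) pairs ranked correctly, counting ties as 0.5.
    pos = np.asarray(pos_scores)[:, None]
    neg = np.asarray(neg_scores)[None, :]
    return np.mean((pos > neg) + 0.5 * (pos == neg))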
from openrec import ModelTrainer
from openrec.recommenders import UCML
from openrec.utils.evaluators import AUC, Recall
from openrec.utils.samplers import RandomPairwiseSampler, EvaluationSampler

# raw_data, batch_size, dim_embed, eval_iter, save_iter and the train/val/test
# datasets are assumed to be prepared as in the BPR example above; the pairwise
# training sampler is likewise assumed (only its tail appears in the original excerpt).
train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

model = UCML(batch_size=batch_size,
             total_users=train_dataset.total_users(),
             total_items=train_dataset.total_items(),
             dim_user_embed=dim_embed,
             dim_item_embed=dim_embed,
             save_model_dir='ucml_recommender/',
             train=True, serve=True)


def train_iter_func(model, batch_data):
    # One UCML training step: take a gradient step on the pairwise loss, then run
    # the 'censor_embedding' operations to constrain the embedding norms.
    loss = model.train(batch_data)['losses'][0]
    model.train(batch_data, operations_id='censor_embedding')
    return loss


model_trainer = ModelTrainer(model=model, train_iter_func=train_iter_func)
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

model_trainer.train(total_iter=int(1e5), eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler, test_sampler],
                    evaluators=[auc_evaluator, recall_evaluator])
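# In collaborative metric learning, censoring typically means projecting user and
# item embeddings back inside the unit ball after each gradient step so learned
# distances stay bounded. A NumPy sketch of that projection (illustrative of the
# idea behind the 'censor_embedding' operations, not OpenRec's implementation):
import numpy as np

def censor_embeddings(embeddings, max_norm=1.0):
    # embeddings: (num_rows, dim) matrix; rows with norm > max_norm are rescaled.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    scale = np.minimum(1.0, max_norm / np.maximum(norms, 1e-12))
    return embeddings * scale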