class VAECF_train(TrainEngine):
    """VAECF_train Class."""

    def __init__(self, args):
        """Initialize VAECF_train Class."""
        print(args)
        super(VAECF_train, self).__init__(args)

    def train(self):
        """Train the model."""
        self.load_dataset()
        self.gpu_id, self.config["model"]["device_str"] = self.get_device()
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.data.instance_vae_loader(
            batch_size=self.config["model"]["batch_size"],
            device=self.config["model"]["device_str"],
        )
        self.config["model"]["n_items"] = self.data.n_items
        self.config["model"]["n_users"] = self.data.n_users
        self.engine = VAECFEngine(self.config)
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class LightGCN_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize LightGCN_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(LightGCN_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = LightGCNEngine(self.config)

    def build_data_loader(self):
        """Build the normalized adjacency matrix and store it in the config."""
        # TODO: define the directory in which to store the adjacency matrix.
        self.sample_generator = DataLoaderBase(ratings=self.data.train)
        adj_mat, norm_adj_mat, mean_adj_mat = self.sample_generator.get_adj_mat(
            self.config
        )
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj_mat)
        self.config["model"]["norm_adj"] = norm_adj
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        self.max_n_update = self.config["model"]["max_n_update"]
        for epoch in range(self.config["model"]["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.engine.save_checkpoint(model_dir=self.model_save_dir)
            if self.eval_engine.n_no_update >= self.max_n_update:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(self.max_n_update)
                )
                break
            train_loader = self.sample_generator.pairwise_negative_train_loader(
                self.config["model"]["batch_size"],
                self.config["model"]["device_str"],
            )
            self.engine.train_an_epoch(epoch_id=epoch, train_loader=train_loader)
            self.eval_engine.train_eval(
                self.data.valid[0], self.data.test[0], self.engine.model, epoch
            )
        self.config["run_time"] = self.monitor.stop()

    def test(self):
        """Test the model."""
        self.engine.resume_checkpoint(model_dir=self.model_save_dir)
        super(LightGCN_train, self).test()

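# ---------------------------------------------------------------------------
# Illustrative sketch (not the library's implementation): one standard way to
# build the symmetrically normalized adjacency D^{-1/2} A D^{-1/2} that
# get_adj_mat()/sparse_mx_to_torch_sparse_tensor() are expected to supply
# above. The bipartite layout, helper names, and the toy data are assumptions
# made for this example.
# ---------------------------------------------------------------------------
import numpy as np
import scipy.sparse as sp
import torch


def build_norm_adj(interactions, n_users, n_items):
    """Return D^{-1/2} A D^{-1/2} over the joint (users + items) node set."""
    r = sp.coo_matrix(
        (np.ones(len(interactions)), (interactions[:, 0], interactions[:, 1])),
        shape=(n_users, n_items),
    )
    # Block adjacency of the bipartite graph: [[0, R], [R^T, 0]].
    adj = sp.bmat([[None, r], [r.T, None]], format="csr")
    deg = np.asarray(adj.sum(axis=1)).flatten()
    with np.errstate(divide="ignore"):
        d_inv_sqrt = np.power(deg, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat = sp.diags(d_inv_sqrt)
    return (d_mat @ adj @ d_mat).tocoo()


def to_torch_sparse(mx):
    """Convert a scipy sparse matrix to a torch sparse float tensor."""
    mx = mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((mx.row, mx.col)).astype(np.int64))
    return torch.sparse_coo_tensor(indices, torch.from_numpy(mx.data), mx.shape)


# Toy usage: three users, two items, four observed interactions.
toy = np.array([[0, 0], [0, 1], [1, 0], [2, 1]])
norm_adj_example = to_torch_sparse(build_norm_adj(toy, n_users=3, n_items=2))
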
class VBCAR_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize VBCAR_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(VBCAR_train, self).__init__(self.config)
        self.load_dataset()
        self.train_data = self.dataset.sample_triple()
        self.config["alpha_step"] = (1 - self.config["alpha"]) / (
            self.config["max_epoch"]
        )
        self.engine = VBCAREngine(self.config)

    def train(self):
        """Default train implementation."""
        assert hasattr(self, "engine"), "Please specify the exact model engine!"
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.engine.data = self.dataset
        print("Start training... ")
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.engine.save_checkpoint(
                    model_dir=os.path.join(self.config["model_save_dir"], "model.cpk")
                )
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            data_loader = self.build_data_loader()
            self.engine.train_an_epoch(data_loader, epoch_id=epoch)
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], self.engine.model, epoch
            )
            # Anneal alpha: it stays near its initial value for most of
            # training and is capped at 1 over the last ~20 epochs.
            self.engine.model.alpha = min(
                self.config["alpha"]
                + math.exp(epoch - self.config["max_epoch"] + 20),
                1,
            )
            # Halve the learning rate every 10 epochs.
            lr = self.config["lr"] * (0.5 ** (epoch // 10))
            for param_group in self.engine.optimizer.param_groups:
                param_group["lr"] = lr
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class LCFN_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize LCFN_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        print(config)
        super(LCFN_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()

    def build_data_loader(self):
        """Build the graph embeddings and store them in the config."""
        # TODO: define the directory in which to store the adjacency matrix.
        self.gpu_id, self.config["model"]["device_str"] = self.get_device()
        self.sample_generator = DataLoaderBase(ratings=self.data.train)
        graph_embeddings = self.sample_generator.get_graph_embeddings(self.config)
        self.config["model"]["graph_embeddings"] = graph_embeddings
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        if self.config["model"]["loss"] == "bpr":
            train_loader = self.data.instance_bpr_loader(
                batch_size=self.config["model"]["batch_size"],
                device=self.config["model"]["device_str"],
            )
        elif self.config["model"]["loss"] == "bce":
            train_loader = self.data.instance_bce_loader(
                num_negative=self.config["model"]["num_negative"],
                batch_size=self.config["model"]["batch_size"],
                device=self.config["model"]["device_str"],
            )
        else:
            raise ValueError(
                f"Unsupported loss type {self.config['model']['loss']},"
                " try other options: 'bpr' or 'bce'"
            )
        self.engine = LCFNEngine(self.config)
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class MF_train(TrainEngine):
    """MF_train Class."""

    def __init__(self, args):
        """Initialize MF_train Class."""
        print(args)
        super(MF_train, self).__init__(args)

    def train(self):
        """Train the model."""
        self.load_dataset()
        self.gpu_id, self.config["model"]["device_str"] = self.get_device()
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        if self.config["model"]["loss"] == "bpr":
            train_loader = self.data.instance_bpr_loader(
                batch_size=self.config["model"]["batch_size"],
                device=self.config["model"]["device_str"],
            )
        elif self.config["model"]["loss"] == "bce":
            train_loader = self.data.instance_bce_loader(
                num_negative=self.config["model"]["num_negative"],
                batch_size=self.config["model"]["batch_size"],
                device=self.config["model"]["device_str"],
            )
        else:
            raise ValueError(
                f"Unsupported loss type {self.config['model']['loss']},"
                " try other options: 'bpr' or 'bce'"
            )
        self.engine = MFEngine(self.config)
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

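# ---------------------------------------------------------------------------
# Illustrative sketch (an assumed config shape, not a shipped default): the
# minimal "model" section that MF_train.train() above reads. With loss="bpr"
# a pairwise loader is built; with loss="bce", num_negative pointwise
# negatives are sampled per positive instead. device_str is filled in at
# runtime by get_device().
# ---------------------------------------------------------------------------
mf_config_example = {
    "system": {"run_dir": "runs/mf", "model_save_dir": "checkpoints"},
    "model": {
        "loss": "bpr",       # or "bce"
        "num_negative": 4,   # only read by the "bce" branch
        "batch_size": 1024,
        "save_name": "mf.model",
    },
}
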
class Triple2vec_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize Triple2vec_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(Triple2vec_train, self).__init__(self.config)
        self.gpu_id, self.config["device_str"] = self.get_device()

    def load_dataset(self):
        """Load dataset."""
        split_data = load_split_dataset(self.config)
        self.data = GroceryData(split_dataset=split_data, config=self.config)
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.load_dataset()
        self.engine = Triple2vecEngine(self.config)
        self.engine.data = self.data
        self.train_data = self.data.sample_triple()
        train_loader = DataLoader(
            torch.LongTensor(self.train_data.to_numpy()).to(self.engine.device),
            batch_size=self.config["model"]["batch_size"],
            shuffle=True,
            drop_last=True,
        )
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class LightGCN_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize LightGCN_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(LightGCN_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = LightGCNEngine(self.config)

    def build_data_loader(self):
        """Build the normalized adjacency matrix and store it in the config."""
        adj_mat, norm_adj_mat, mean_adj_mat = self.data.get_adj_mat(self.config)
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj_mat)
        self.config["model"]["norm_adj"] = norm_adj
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        train_loader = self.data.instance_bpr_loader(
            batch_size=self.config["model"]["batch_size"],
            device=self.config["model"]["device_str"],
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class UltraGCN_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize UltraGCN_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(UltraGCN_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = UltraGCNEngine(self.config)

    def build_data_loader(self):
        """Load the training and constraint matrices into the config."""
        train_mat, constraint_mat = self.data.get_constraint_mat(self.config)
        self.config["model"]["train_mat"] = train_mat
        self.config["model"]["constraint_mat"] = constraint_mat
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        train_loader = self.data.instance_mul_neg_loader(
            batch_size=self.config["model"]["batch_size"],
            device=self.config["model"]["device_str"],
            num_negative=self.config["model"]["negative_num"],
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class SGL_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize SGL_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(SGL_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = SGLEngine(self.config)

    def build_data_loader(self):
        """Build the normalized adjacency matrix and store it in the config."""
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items
        norm_adj = self.data.create_sgl_mat(self.config)
        self.config["model"]["norm_adj"] = norm_adj

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["system"]["model_save_dir"], self.config["model"]["save_name"]
        )
        train_loader = self.data.instance_bpr_loader(
            batch_size=self.config["model"]["batch_size"],
            device=self.config["model"]["device_str"],
        )
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

class NARM_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize NARM_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(NARM_train, self).__init__(self.config)
        self.load_dataset_seq()
        self.build_data_loader()
        self.engine = NARMEngine(self.config)
        self.seq_eval_engine = SeqEvalEngine(self.config)

    def load_dataset_seq(self):
        """Load and temporally split the dataset for sequential training.

        Returns:
            None
        """
        ld_dataset = load_dataset(self.config)
        ld_dataset.download()
        ld_dataset.load_interaction()
        self.dataset = ld_dataset.make_temporal_split(n_negative=0, n_test=0)
        self.train_data = self.dataset[self.dataset.col_flag == "train"]
        self.valid_data = self.dataset[self.dataset.col_flag == "validate"]
        self.test_data = self.dataset[self.dataset.col_flag == "test"]
        self.config["n_users"] = self.train_data.col_user.nunique()
        self.config["n_items"] = self.train_data.col_item.nunique() + 1

    def build_data_loader(self):
        """Convert users' interactions to sequences.

        Returns:
            load_train_data (DataLoader): training set.
        """
        # Reindex items from 1.
        self.train_data, self.valid_data, self.test_data = reindex_items(
            self.train_data, self.valid_data, self.test_data
        )
        # Convert interactions to sequences.
        self.valid_data = create_seq_db(self.valid_data)
        self.test_data = create_seq_db(self.test_data)
        seq_train_data = create_seq_db(self.train_data)
        # Convert sequences to (seq, target) format.
        load_train_data = dataset_to_seq_target_format(seq_train_data)
        # Wrap in the pytorch Dataset class for sequential datasets.
        load_train_data = SeqDataset(load_train_data)
        # Pad the sequences with 0.
        self.load_train_data = DataLoader(
            load_train_data,
            batch_size=self.config["batch_size"],
            shuffle=False,
            collate_fn=collate_fn,
        )
        return self.load_train_data

    def _train(self, engine, train_loader, save_dir):
        """Train the model over epochs.

        Returns:
            None
        """
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch=epoch)
            # Evaluate the model on the validation and test sets.
            self.seq_eval_engine.train_eval_seq(
                self.valid_data, self.test_data, engine, epoch
            )

    def train(self):
        """Train and test NARM.

        Returns:
            None
        """
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.load_train_data
        self.engine = NARMEngine(self.config)
        self.narm_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        self._train(self.engine, train_loader, self.narm_save_dir)
        self.config["run_time"] = self.monitor.stop()
        self.seq_eval_engine.test_eval_seq(self.test_data, self.engine)

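# ---------------------------------------------------------------------------
# Illustrative sketch (a toy re-implementation, not the library helper): what
# a dataset_to_seq_target_format-style transform is expected to do with the
# session sequences built above. Each session of length n yields n - 1
# (prefix, next-item) training pairs; collate_fn then pads the
# variable-length prefixes with 0 into fixed-size batches.
# ---------------------------------------------------------------------------
def seq_to_targets(sessions):
    """Expand sessions into (prefix, target) pairs for next-item prediction."""
    pairs = []
    for seq in sessions:
        for t in range(1, len(seq)):
            pairs.append((seq[:t], seq[t]))
    return pairs


# A session [3, 7, 9] becomes ([3], 7) and ([3, 7], 9).
assert seq_to_targets([[3, 7, 9]]) == [([3], 7), ([3, 7], 9)]
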
class NCF_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize NCF_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(NCF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["model"]["device_str"] = self.get_device()

    def build_data_loader(self):
        """Build the sample generator and store data statistics in the config."""
        # TODO: define the directory in which to store the adjacency matrix.
        self.sample_generator = DataLoaderBase(ratings=self.data.train)
        self.config["model"]["num_batch"] = (
            self.data.n_train // self.config["model"]["batch_size"] + 1
        )
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Dispatch training according to the configured model type.

        Options are: 'mlp', 'gmf', 'ncf_end', and 'ncf_pre'.
        """
        if self.config["model"]["model"] == "ncf_end":
            # Train NeuMF without pre-training.
            self.train_ncf()
        elif self.config["model"]["model"] == "gmf":
            self.train_gmf()
        elif self.config["model"]["model"] == "mlp":
            self.train_mlp()
        elif self.config["model"]["model"] == "ncf_pre":
            # Pre-train GMF and MLP, then train NeuMF.
            self.train_gmf()
            self.train_mlp()
            self.train_ncf()
        else:
            raise ValueError(
                "Model type error: Options are: 'mlp', 'gmf', 'ncf_end', and 'ncf_pre'."
            )

    def train_ncf(self):
        """Train NeuMF.

        Returns:
            None
        """
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["model"]["num_negative"], self.config["model"]["batch_size"]
        )
        self.engine = NeuMFEngine(self.config)
        self.neumf_save_dir = os.path.join(
            self.config["system"]["model_save_dir"],
            self.config["model"]["neumf_config"]["save_name"],
        )
        self._train(self.engine, train_loader, self.neumf_save_dir)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.data.test, self.engine.model)

    def train_gmf(self):
        """Train GMF.

        Returns:
            None
        """
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["model"]["num_negative"], self.config["model"]["batch_size"]
        )
        self.engine = GMFEngine(self.config)
        self.gmf_save_dir = os.path.join(
            self.config["system"]["model_save_dir"],
            self.config["model"]["gmf_config"]["save_name"],
        )
        self._train(self.engine, train_loader, self.gmf_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.data.test, self.engine.model)

    def train_mlp(self):
        """Train MLP.

        Returns:
            None
        """
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["model"]["num_negative"], self.config["model"]["batch_size"]
        )
        self.engine = MLPEngine(self.config)
        self.mlp_save_dir = os.path.join(
            self.config["system"]["model_save_dir"],
            self.config["model"]["mlp_config"]["save_name"],
        )
        self._train(self.engine, train_loader, self.mlp_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.data.test, self.engine.model)

class NCF_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize NCF_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(NCF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()

    def build_data_loader(self):
        """Build the sample generator and store data statistics in the config."""
        # TODO: define the directory in which to store the adjacency matrix.
        user_fea_norm_adj, item_fea_norm_adj = self.dataset.make_fea_sim_mat()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            user_fea_norm_adj
        )
        self.config["item_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            item_fea_norm_adj
        )
        self.config["num_batch"] = (
            self.dataset.n_train // self.config["batch_size"] + 1
        )
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def check_early_stop(self, engine, model_dir, epoch):
        """Check if the early stop criterion is triggered.

        Saves the model if the previous epoch already obtained a better result.

        Args:
            epoch (int): epoch num.

        Returns:
            bool: True if the early stop criterion is triggered, False otherwise.
        """
        if epoch > 0 and self.eval_engine.n_no_update == 0:
            # Save the model: the previous epoch already obtained a better result.
            engine.save_checkpoint(model_dir=model_dir)
        if self.eval_engine.n_no_update >= MAX_N_UPDATE:
            # Stop training: the early stop criterion is triggered.
            print(
                "Early stop criterion triggered: no performance update"
                " for {} consecutive epochs".format(MAX_N_UPDATE)
            )
            return True
        return False

    def _train(self, engine, train_loader, save_dir):
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            # Evaluate the model on the validation and test sets.
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], engine.model, epoch
            )

    def train(self):
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["num_negative"], self.config["batch_size"]
        )

        # Train NCF without pre-training.
        self.config["pretrain"] = None
        self.config["model"] = "NCF_wo_pre"
        self.engine = NeuMFEngine(self.config)
        self.neumf_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["neumf_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.neumf_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )

        # Train GCN.
        self.config["pretrain"] = None
        self.config["model"] = "GCN"
        self.engine = GCN_SEngine(self.config)
        self.gcn_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["gcn_config"]["save_name"]
        )
        self._train(
            engine=self.engine, train_loader=self.dataset, save_dir=self.gcn_save_dir
        )
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        # Train GMF.
        self.config["pretrain"] = None
        self.config["model"] = "GMF"
        self.engine = GMFEngine(self.config)
        self.gmf_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["gmf_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.gmf_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )

        # Train MLP.
        self.config["pretrain"] = None
        self.config["model"] = "mlp"
        self.engine = MLPEngine(self.config)
        self.mlp_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["mlp_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.mlp_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )

        # Train ncf_gmf (NeuMF initialized from the pre-trained GMF).
        self.config["pretrain"] = "gmf"
        self.config["model"] = "ncf_gmf"
        self.engine = NeuMFEngine(self.config)
        self.neumf_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["neumf_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.neumf_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )

        # Train ncf_gcn (NeuMF initialized from the pre-trained GCN).
        self.config["pretrain"] = "gcn"
        self.config["model"] = "ncf_gcn"
        self.engine = NeuMFEngine(self.config)
        self.neumf_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["neumf_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.neumf_save_dir)
        while self.eval_engine.n_worker:
            # Wait for the evaluation workers to finish.
            print("Waiting 15s for eval_engine.n_worker to complete")
            time.sleep(15)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

class TrainEngine(object):
    """Training engine for all the models."""

    def __init__(self, config):
        """Initialize the TrainEngine.

        Args:
            config (dict): Config dict received from the command line.
                Should contain config["config_file"].

        Attributes:
            dataset (Dataset): A dataset containing the train, validation
                and test DataFrames.
            train_data (DataLoader): Extracted training data or train
                DataLoader; needs to be implemented by subclasses.
            monitor (Monitor): A monitor object that tracks the
                computational resources.
            engine (ModelEngine): The model engine to be trained.
        """
        self.dataset = None
        self.train_data = None
        self.monitor = None
        self.engine = None
        self.config = prepare_env(config)
        self.gpu_id, self.config["device_str"] = (
            get_device() if self.config["device"] == "gpu" else (None, "cpu")
        )
        self.eval_engine = EvalEngine(self.config)

    def load_dataset(self):
        """Default implementation of building the dataset.

        Returns:
            None
        """
        self.dataset = data_util.Dataset(self.config)
        self.config["item_fea"] = self.dataset.item_feature
        self.config["user_fea"] = self.dataset.user_feature
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    # noinspection PyTypeChecker
    def build_data_loader(self):
        """Default data loader builder.

        Returns:
            DataLoader
        """
        return DataLoader(
            torch.LongTensor(self.train_data.to_numpy()).to(self.engine.device),
            batch_size=self.config["batch_size"],
            shuffle=True,
            drop_last=True,
        )

    def train(self):
        """Default train implementation."""
        assert hasattr(self, "engine"), "Please specify the exact model engine!"
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.engine.data = self.dataset
        print("Start training... ")
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.engine.save_checkpoint(
                    model_dir=os.path.join(self.config["model_save_dir"], "model.cpk")
                )
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            data_loader = self.build_data_loader()
            self.engine.train_an_epoch(data_loader, epoch_id=epoch)
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], self.engine.model, epoch
            )
            # Halve the learning rate every 10 epochs.
            lr = self.config["lr"] * (0.5 ** (epoch // 10))
            for param_group in self.engine.optimizer.param_groups:
                param_group["lr"] = lr
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

    def test(self):
        """Evaluate the performance on the test set with the final model."""
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

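# ---------------------------------------------------------------------------
# Illustrative sketch (an assumed entry point, not a shipped script): how the
# TrainEngine subclasses above are typically driven. The config keys and the
# choice of MF_train are assumptions made for this example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = {"config_file": "configs/mf_default.json", "device": "gpu"}
    trainer = MF_train(config)    # any TrainEngine subclass follows the same flow
    best_valid = trainer.train()  # trains, early-stops, and checkpoints
    trainer.test()                # evaluates the final model on the test set
    print("Best validation performance:", best_valid)
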
class cmn_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize cmn_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(cmn_train, self).__init__(self.config)
        self.load_dataset()
        self.gmfengine = PairwiseGMFEngine(self.config)
        # Keep a reference to the class; it is instantiated in train() once
        # the pre-trained embeddings are available.
        self.cmnengine = cmnEngine
        self.gpu_id, self.config["device_str"] = self.get_device()

    def train_gmf(self):
        """Pre-train the pairwise GMF model and export its embeddings."""
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        for epoch in range(self.config["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.gmfengine.save_checkpoint(model_dir=self.model_dir)
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            train_loader = self.data
            self.gmfengine.train_an_epoch(epoch_id=epoch, train_loader=train_loader)

        print("Saving embeddings to: %s" % self.config["model_save_dir"])
        user_embed, item_embed, v = (
            self.gmfengine.model.user_memory.weight.detach().cpu(),
            self.gmfengine.model.item_memory.weight.detach().cpu(),
            self.gmfengine.model.v.weight.detach().cpu(),
        )
        embed_dir = os.path.join(self.config["model_save_dir"], "pretrain/embeddings")
        ensureDir(embed_dir)
        np.savez(embed_dir, user=user_embed, item=item_embed, v=v)
        self.config["run_time"] = self.monitor.stop()
        return np.array(user_embed), np.array(item_embed)

    def train(self):
        """Train the CMN model, optionally pre-training the GMF embeddings."""
        if self.config["pretrain"] == "gmf":
            user_embed, item_embed = self.train_gmf()
        model = self.cmnengine(
            self.config, user_embed, item_embed, self.data.item_users_list
        )
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        for epoch in range(self.config["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                model.save_checkpoint(model_dir=self.model_dir)
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            train_loader = self.data
            model.train_an_epoch(epoch_id=epoch, train_loader=train_loader)
            self.eval_engine.train_eval(
                self.data.valid[0], self.data.test[0], model.model, epoch
            )
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.data.test, model.model)

class NGCF_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize NGCF_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(NGCF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = NGCFEngine(self.config)

    def build_data_loader(self):
        """Build the normalized adjacency matrix and store it in the config."""
        # TODO: define the directory in which to store the adjacency matrix.
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj
        self.config["num_batch"] = (
            self.dataset.n_train // self.config["batch_size"] + 1
        )
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def train(self):
        """Train the model."""
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        for epoch in range(self.config["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.engine.save_checkpoint(model_dir=self.model_dir)
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            train_loader = self.dataset
            self.engine.train_an_epoch(epoch_id=epoch, train_loader=train_loader)
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], self.engine.model, epoch
            )
        self.config["run_time"] = self.monitor.stop()

    def test(self):
        """Test the model with the best checkpoint."""
        self.engine.resume_checkpoint(model_dir=self.model_dir)
        super(NGCF_train, self).test()

class VBCAR_train(TrainEngine):
    """An instance class from the TrainEngine base class."""

    def __init__(self, config):
        """Initialize VBCAR_train Class.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(VBCAR_train, self).__init__(self.config)

    def load_dataset(self):
        """Load dataset."""
        split_data = load_split_dataset(self.config)
        self.data = GroceryData(split_dataset=split_data, config=self.config)
        self.config["model"]["n_users"] = self.data.n_users
        self.config["model"]["n_items"] = self.data.n_items

    def train(self):
        """Train the model."""
        self.load_dataset()
        self.train_data = self.data.sample_triple()
        self.config["model"]["alpha_step"] = (
            1 - self.config["model"]["alpha"]
        ) / (self.config["model"]["max_epoch"])
        self.config["user_fea"] = self.data.user_feature
        self.config["item_fea"] = self.data.item_feature
        self.engine = VBCAREngine(self.config)
        self.engine.data = self.data
        assert hasattr(self, "engine"), "Please specify the exact model engine!"
        self.monitor = Monitor(
            log_dir=self.config["system"]["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        print("Start training... ")
        epoch_bar = tqdm(range(self.config["model"]["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # The previous epoch already obtained a better result.
                self.engine.save_checkpoint(
                    model_dir=os.path.join(
                        self.config["system"]["model_save_dir"], "model.cpk"
                    )
                )
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered: no performance update"
                    " for {} consecutive epochs".format(MAX_N_UPDATE)
                )
                break
            data_loader = DataLoader(
                torch.LongTensor(self.train_data.to_numpy()).to(self.engine.device),
                batch_size=self.config["model"]["batch_size"],
                shuffle=True,
                drop_last=True,
            )
            self.engine.train_an_epoch(data_loader, epoch_id=epoch)
            self.eval_engine.train_eval(
                self.data.valid[0], self.data.test[0], self.engine.model, epoch
            )
            # Anneal alpha: it stays near its initial value for most of
            # training and is capped at 1 over the last ~20 epochs.
            self.engine.model.alpha = min(
                self.config["model"]["alpha"]
                + math.exp(epoch - self.config["model"]["max_epoch"] + 20),
                1,
            )
            # Halve the learning rate every 10 epochs.
            lr = self.config["model"]["lr"] * (0.5 ** (epoch // 10))
            for param_group in self.engine.optimizer.param_groups:
                param_group["lr"] = lr
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

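# ---------------------------------------------------------------------------
# Illustrative sketch (standalone arithmetic mirroring the anneal step in
# VBCAR_train.train() above): alpha stays near its initial value for most of
# training, then ramps sharply and hits the min(., 1) cap roughly 20 epochs
# before max_epoch, since exp(epoch - max_epoch + 20) reaches 1 at
# epoch = max_epoch - 20. The initial alpha and max_epoch are assumptions.
# ---------------------------------------------------------------------------
import math

alpha0, max_epoch = 0.1, 100
for epoch in (0, 60, 75, 78, 79, 80, 99):
    alpha = min(alpha0 + math.exp(epoch - max_epoch + 20), 1)
    print(f"epoch {epoch:3d}: alpha = {alpha:.4f}")
# epochs 0-60: ~0.1000; 75: 0.1067; 78: 0.2353; 79: 0.4679; 80 onward: 1.0000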