def save_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)

    # dataset filtering
    dataset = create_dataset(config)
    dataset.save('../saved/')

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data))

    model = get_model(config['model'])(config, train_data).to(config['device'])

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    # the best model will be saved in here
    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, saved=True, show_progress=config['show_progress']
    )
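# Illustrative usage sketch (not part of the original source): calling save_example()
# writes three kinds of artifacts under '../saved/' (the filtered dataset, the split
# dataloaders, and the best model checkpoint), which load_example() further below reads
# back in. The exact file names depend on the dataset, model, and timestamp.
if __name__ == '__main__':
    save_example()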
def load_data_and_model(model_file):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    dataset = create_dataset(config)
    logger.info(dataset)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    init_seed(config['seed'], config['reproducibility'])
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
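# Illustrative usage sketch (assumption, not from the original source): given a
# hypothetical checkpoint path, the loader above can be combined with RecBole's
# get_trainer to re-evaluate a saved model without retraining.
def evaluate_saved_model(model_file='../saved/BPR-example.pth'):  # hypothetical path
    config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_file)
    # rebuild the trainer for the stored model type and score it on the test split
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
    test_result = trainer.evaluate(test_data, load_best_model=True, model_file=model_file)
    return test_result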
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r"""A fast-running API that covers the complete process of training and testing a model
    on a specified dataset.

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    print(train_data.dataset.item_feat)
    print(valid_data.dataset.item_feat)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    with profiler.profile(enabled=config["monitor"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        best_valid_score, best_valid_result = trainer.fit(
            train_data, valid_data, saved=saved, show_progress=config['show_progress']
        )
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    # model evaluation
    with profiler.profile(enabled=config["monitor_eval"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        test_result = trainer.evaluate(
            test_data, load_best_model=saved, show_progress=config['show_progress'],
            cold_warm_distinct_eval=True
        )
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r"""A fast-running API that covers the complete process of training and testing a model
    on a specified dataset.

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, saved=saved)

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved)

    logger.info('best valid result: {}'.format(best_valid_result))
    logger.info('test result: {}'.format(test_result))

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r"""A fast-running API that covers the complete process of training and testing a model
    on a specified dataset.

    Args:
        model (str, optional): Model name. Defaults to ``None``.
        dataset (str, optional): Dataset name. Defaults to ``None``.
        config_file_list (list, optional): Config files used to modify experiment parameters. Defaults to ``None``.
        config_dict (dict, optional): Parameters dictionary used to modify experiment parameters. Defaults to ``None``.
        saved (bool, optional): Whether to save the model. Defaults to ``True``.
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    if config['save_dataset']:
        dataset.save()
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    if config['save_dataloaders']:
        save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data))

    # model loading and initialization
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, saved=saved, show_progress=config['show_progress']
    )

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'])

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
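# Illustrative usage sketch (assumption, not from the original source): the quick-start
# function above can be driven entirely from a parameter dictionary. 'ml-100k' is
# RecBole's bundled demo dataset and the epoch count is an arbitrary example value,
# not a recommended setting.
if __name__ == '__main__':
    result = run_recbole(
        model='BPR',
        dataset='ml-100k',
        config_dict={'epochs': 10, 'save_dataset': False, 'save_dataloaders': False}
    )
    print(result['best_valid_result'])
    print(result['test_result'])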
def load_data_and_model(model_file, dataset_file=None, dataloader_file=None):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.
        dataset_file (str, optional): The path of filtered dataset. Defaults to ``None``.
        dataloader_file (str, optional): The path of split dataloaders. Defaults to ``None``.

    Note:
        The :attr:`dataset` will be loaded or created according to the following strategy:
        If :attr:`dataset_file` is not ``None``, the :attr:`dataset` will be loaded from :attr:`dataset_file`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is ``None``,
        the :attr:`dataset` will be created according to :attr:`config`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is not ``None``,
        the :attr:`dataset` will be neither loaded nor created.

        The :attr:`dataloader` will be loaded or created according to the following strategy:
        If :attr:`dataloader_file` is not ``None``, the :attr:`dataloader` will be loaded from :attr:`dataloader_file`.
        If :attr:`dataloader_file` is ``None``, the :attr:`dataloader` will be created according to :attr:`config`.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_logger(config)

    dataset = None
    if dataset_file:
        with open(dataset_file, 'rb') as f:
            dataset = pickle.load(f)

    if dataloader_file:
        train_data, valid_data, test_data = load_split_dataloaders(dataloader_file)
    else:
        if dataset is None:
            dataset = create_dataset(config)
        train_data, valid_data, test_data = data_preparation(config, dataset)

    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
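# Illustrative usage sketch (assumption, not from the original source): the paths below
# reuse the example file names from load_example() further down. When both optional files
# are supplied, neither the dataset nor the dataloaders are rebuilt from the raw data.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../saved/BPR-Mar-20-2021_17-11-05.pth',
    dataset_file='../saved/ml-100k-dataset.pth',
    dataloader_file='../saved/ml-100k-for-BPR-dataloader.pth'
)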
def load_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()

    with open('../saved/ml-100k-dataset.pth', 'rb') as f:  # You can use your filtered data path here.
        dataset = pickle.load(f)

    train_data, valid_data, test_data = load_split_dataloaders(
        '../saved/ml-100k-for-BPR-dataloader.pth'
    )  # You can use your split data path here.

    model = get_model(config['model'])(config, train_data).to(config['device'])
    checkpoint = torch.load('../saved/BPR-Mar-20-2021_17-11-05.pth')  # Here you can replace it with your model path.
    model.load_state_dict(checkpoint['state_dict'])
    logger.info(model)
    logger.info(train_data.dataset)
    logger.info(valid_data.dataset)
    logger.info(test_data.dataset)
config_dict = {
    'load_col': None,
    'neg_sampling': None,
    'benchmark_filename': ['train', 'test'],
    'alias_of_item_id': ['item_id_list'],
    'topk': [20],
    'metrics': ['Recall', 'MRR'],
    'valid_metric': 'MRR@20'
}
config = Config(model=args.model, dataset=f'{args.dataset}', config_dict=config_dict)
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
logger.info(args)
logger.info(config)

# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_dataset, test_dataset = dataset.build()
if args.validation:
    train_dataset.shuffle()
    new_train_dataset, new_test_dataset = train_dataset.split_by_ratio(
        [1 - args.valid_portion, args.valid_portion]
    )
def run_trial(model_name, dataset_name, hp_config=None, save_flag=False):
    if not hp_config:
        hp_config = {}
        tuning = False
    else:
        tuning = True

    commons.init_seeds()
    verbose = (not tuning)

    model_class = statics.model_name_map[model_name]
    try:
        default_config = model_class.default_params
    except AttributeError:
        default_config = {}
        assert model_name in statics.recbole_models

    default_config.update(statics.datasets_params[dataset_name])
    default_config.update(hp_config)

    config = Config(model=model_class, dataset=dataset_name, config_dict=default_config)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = logging.getLogger()  # logger initialization
    if verbose:
        logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    train_data = add_graph(train_data)
    if verbose:
        logger.info(dataset)

    model = model_class(config, train_data).to(commons.device)
    trainer = utils.get_trainer(config)(config, model)

    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, verbose=verbose, show_progress=verbose
    )
    test_result = trainer.evaluate(test_data)

    if verbose:
        logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
        logger.info(set_color('test result', 'yellow') + f': {test_result}')

    metric = str.lower(config['valid_metric'])

    if save_flag:
        os.makedirs(os.path.join("bestmodels", dataset_name, str(config["topk"])), exist_ok=True)
        save_path = os.path.join("bestmodels", dataset_name, str(config["topk"]), "{}.pth".format(model_name))
    else:
        save_path = None

    if save_path:
        shutil.copyfile(trainer.saved_model_file, save_path)

    return {
        'metric': config['valid_metric'],
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_score': test_result[metric]
    }
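# Illustrative usage sketch (assumption, not from the original source): 'LightGCN' and
# 'ml-100k' are only example arguments; model_name must be a key of statics.model_name_map
# and dataset_name must have an entry in statics.datasets_params. The hp_config values are
# arbitrary RecBole hyperparameters, not tuned settings.
if __name__ == '__main__':
    trial_result = run_trial(
        model_name='LightGCN',
        dataset_name='ml-100k',
        hp_config={'learning_rate': 1e-3, 'embedding_size': 64},
        save_flag=True
    )
    print(trial_result['metric'], trial_result['test_score'])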
def run_evaluation(model_name, dataset_name, model_path):
    global_dict[model_name] = OrderedDict()
    for metric in all_metrics:
        global_dict[model_name][metric] = OrderedDict()

    kvals = [10, 20, 30]
    dataset_initialized = False
    train_data = None
    adj_sets = None

    for K in kvals:
        commons.init_seeds()
        model_class = statics.model_name_map[model_name]
        model_path = os.path.join("bestmodels", dataset_name, str(K), "{}.pth".format(model_name))
        loaded_file = torch.load(model_path)
        config = loaded_file['config']
        config['data_path'] = os.path.join('dataset', dataset_name)
        config['topk'] = K
        config['valid_metric'] = 'Recall@{}'.format(K)
        config['eval_batch_size'] = 500000

        init_seed(config['seed'], config['reproducibility'])
        init_logger(config)
        logger = logging.getLogger()

        if not dataset_initialized:
            # dataset filtering
            dataset = create_dataset(config)
            train_data, valid_data, test_data = data_preparation(config, dataset)
            train_data = add_graph(train_data)
            item_degrees = train_data.graph.in_degrees()[train_data.num_users:]
            adj_sets = construct_sets(train_data)
            dataset_initialized = True

        assert adj_sets
        assert train_data

        model = model_class(config, train_data).to(commons.device)
        trainer = utils.get_trainer(config)(config, model)
        test_result = trainer.evaluate(test_data, load_best_model=True, model_file=model_path)

        custom_evaluator = CustomEvaluator(trainer, config, config['metrics'])
        novelty, diversity = custom_evaluator.div_nov(
            train_data.num_users, train_data.num_items, item_degrees, test_data, adj_sets
        )
        novelty = round(novelty, 4)
        diversity = round(diversity, 4)

        for metric in all_metrics:
            if metric not in ['novelty', 'diversity']:
                global_dict[model_name][metric][K] = test_result["{}@{}".format(metric, K)]
        global_dict[model_name]['novelty'][K] = novelty
        global_dict[model_name]['diversity'][K] = diversity
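# Illustrative usage sketch (assumption, not from the original source): run_evaluation
# relies on module-level global_dict and all_metrics structures; the metric names below
# are hypothetical examples and must match the lowercase keys produced by trainer.evaluate
# plus the custom 'novelty' and 'diversity' entries. model_path is recomputed per K inside
# the function, so the argument passed here is only a placeholder.
global_dict = OrderedDict()
all_metrics = ['recall', 'ndcg', 'novelty', 'diversity']  # hypothetical metric list
run_evaluation('LightGCN', 'ml-100k', model_path=None)
print(global_dict['LightGCN'])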