Example #1
def save_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)

    # dataset filtering
    dataset = create_dataset(config)
    dataset.save('../saved/')

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    save_split_dataloaders(config,
                           dataloaders=(train_data, valid_data, test_data))

    model = get_model(config['model'])(config, train_data).to(config['device'])

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    # the best model will be saved here
    best_valid_score, best_valid_result = trainer.fit(
        train_data,
        valid_data,
        saved=True,
        show_progress=config['show_progress'])
Example #2
def load_data_and_model(model_file):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.

    Returns:
        tuple:
            - config (Config): An instance of Config that records the parameter information stored in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    dataset = create_dataset(config)
    logger.info(dataset)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    init_seed(config['seed'], config['reproducibility'])
    model = get_model(config['model'])(config,
                                       train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
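A minimal usage sketch for the loader above: reload a checkpoint and re-score the test split. The checkpoint path is a placeholder (it follows the naming used in Example #7), and the helper name evaluate_saved_model is only illustrative.
def evaluate_saved_model(model_file='../saved/BPR-Mar-20-2021_17-11-05.pth'):
    # everything needed for evaluation is rebuilt from the checkpoint
    config, model, dataset, train_data, valid_data, test_data = \
        load_data_and_model(model_file)
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
    # the weights were already restored by load_data_and_model,
    # so the best checkpoint does not need to be reloaded here
    return trainer.evaluate(test_data, load_best_model=False)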
Example #3
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    # debug output: inspect the item features of the train/valid splits
    print(train_data.dataset.item_feat)
    print(valid_data.dataset.item_feat)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    with profiler.profile(enabled=config["monitor"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        best_valid_score, best_valid_result = trainer.fit(
            train_data, valid_data, saved=saved, show_progress=config['show_progress']
        )
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    # model evaluation
    with profiler.profile(enabled=config["monitor_eval"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'], cold_warm_distinct_eval=True)
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
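This variant wraps both training and evaluation in torch.autograd.profiler.profile, gated by the monitor / monitor_eval config flags; when profiling is disabled the context manager yields None, which is why prof is checked before printing. A minimal sketch of the same pattern in isolation (the helper name profiled_call is only illustrative):
from torch.autograd import profiler

def profiled_call(fn, *args, enabled=True, **kwargs):
    # run fn under the autograd profiler and print a CPU-time summary
    with profiler.profile(enabled=enabled, with_stack=True, profile_memory=True) as prof:
        result = fn(*args, **kwargs)
    if prof is not None:  # prof is None when enabled=False
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))
    return result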
Example #4
def run_recbole(model=None,
                dataset=None,
                config_file_list=None,
                config_dict=None,
                saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model,
                    dataset=dataset,
                    config_file_list=config_file_list,
                    config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(train_data,
                                                      valid_data,
                                                      saved=saved)

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved)

    logger.info('best valid result: {}'.format(best_valid_result))
    logger.info('test result: {}'.format(test_result))

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
Example #5
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str, optional): Model name. Defaults to ``None``.
        dataset (str, optional): Dataset name. Defaults to ``None``.
        config_file_list (list, optional): Config files used to modify experiment parameters. Defaults to ``None``.
        config_dict (dict, optional): Parameters dictionary used to modify experiment parameters. Defaults to ``None``.
        saved (bool, optional): Whether to save the model. Defaults to ``True``.
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    if config['save_dataset']:
        dataset.save()
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    if config['save_dataloaders']:
        save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data))

    # model loading and initialization
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, saved=saved, show_progress=config['show_progress']
    )

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'])

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
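A typical call of the quick-start API above; the model, dataset and overridden parameters are only illustrative:
if __name__ == '__main__':
    result = run_recbole(
        model='BPR',
        dataset='ml-100k',
        config_dict={'epochs': 50, 'show_progress': False}  # illustrative overrides
    )
    print(result['best_valid_score'])
    print(result['test_result'])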
Example #6
def load_data_and_model(model_file, dataset_file=None, dataloader_file=None):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.
        dataset_file (str, optional): The path of filtered dataset. Defaults to ``None``.
        dataloader_file (str, optional): The path of split dataloaders. Defaults to ``None``.

    Note:
        The :attr:`dataset` will be loaded or created according to the following strategy:
        If :attr:`dataset_file` is not ``None``, the :attr:`dataset` will be loaded from :attr:`dataset_file`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is ``None``,
        the :attr:`dataset` will be created according to :attr:`config`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is not ``None``,
        the :attr:`dataset` will be neither loaded nor created.

        The :attr:`dataloader` will be loaded or created according to the following strategy:
        If :attr:`dataloader_file` is not ``None``, the :attr:`dataloader` will be loaded from :attr:`dataloader_file`.
        If :attr:`dataloader_file` is ``None``, the :attr:`dataloader` will be created according to :attr:`config`.

    Returns:
        tuple:
            - config (Config): An instance of Config that records the parameter information stored in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_logger(config)

    dataset = None
    if dataset_file:
        with open(dataset_file, 'rb') as f:
            dataset = pickle.load(f)

    if dataloader_file:
        train_data, valid_data, test_data = load_split_dataloaders(dataloader_file)
    else:
        if dataset is None:
            dataset = create_dataset(config)
        train_data, valid_data, test_data = data_preparation(config, dataset)

    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
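A hedged usage sketch for the loader above, reusing the artifact names that appear in Example #7; all three paths are placeholders for whatever your own save run produced:
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../saved/BPR-Mar-20-2021_17-11-05.pth',         # saved checkpoint
    dataset_file='../saved/ml-100k-dataset.pth',                # filtered dataset (optional)
    dataloader_file='../saved/ml-100k-for-BPR-dataloader.pth',  # split dataloaders (optional)
)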
Example #7
def load_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()

    # You can use your filtered data path here.
    with open('../saved/ml-100k-dataset.pth', 'rb') as f:
        dataset = pickle.load(f)

    # You can use your split data path here.
    train_data, valid_data, test_data = load_split_dataloaders(
        '../saved/ml-100k-for-BPR-dataloader.pth')

    model = get_model(config['model'])(config, train_data).to(config['device'])
    # Here you can replace it with your model path.
    checkpoint = torch.load('../saved/BPR-Mar-20-2021_17-11-05.pth')
    model.load_state_dict(checkpoint['state_dict'])
    logger.info(model)
    logger.info(train_data.dataset)
    logger.info(valid_data.dataset)
    logger.info(test_data.dataset)
Example #8
    config_dict = {
        'load_col': None,
        'neg_sampling': None,
        'benchmark_filename': ['train', 'test'],
        'alias_of_item_id': ['item_id_list'],
        'topk': [20],
        'metrics': ['Recall', 'MRR'],
        'valid_metric': 'MRR@20'
    }

    config = Config(model=args.model,
                    dataset=f'{args.dataset}',
                    config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(args)
    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_dataset, test_dataset = dataset.build()
    if args.validation:
        train_dataset.shuffle()
        new_train_dataset, new_test_dataset = train_dataset.split_by_ratio(
            [1 - args.valid_portion, args.valid_portion])
Example #9
def run_trial(model_name, dataset_name, hp_config=None, save_flag=False):

    if not hp_config:
        hp_config = {}
        tuning = False
    else:
        tuning = True

    commons.init_seeds()
    verbose = not tuning
    model_class = statics.model_name_map[model_name]
    try:
        default_config = model_class.default_params
    except AttributeError:
        default_config = {}
        assert model_name in statics.recbole_models

    default_config.update(statics.datasets_params[dataset_name])
    default_config.update(hp_config)

    config = Config(model=model_class, dataset=dataset_name, config_dict=default_config)
    init_seed(config['seed'], config['reproducibility'])

    init_logger(config)
    logger = logging.getLogger()

    # logger initialization
    if verbose:
        logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    train_data = add_graph(train_data)

    if verbose:
        logger.info(dataset)

    model = model_class(config, train_data).to(commons.device)
    trainer = utils.get_trainer(config)(config, model)

    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=verbose, show_progress=verbose)
    test_result = trainer.evaluate(test_data)

    if verbose:
        logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
        logger.info(set_color('test result', 'yellow') + f': {test_result}')

    metric = config['valid_metric'].lower()

    if save_flag:
        os.makedirs(os.path.join("bestmodels", dataset_name, str(config["topk"])), exist_ok=True)
        save_path = os.path.join("bestmodels", dataset_name, str(config["topk"]), "{}.pth".format(model_name))
    else:
        save_path = None

    if save_path:
        shutil.copyfile(trainer.saved_model_file, save_path)

    return {
        'metric': config['valid_metric'],
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_score': test_result[metric]
    }
Example #10
def run_evaluation(model_name, dataset_name, model_path):

    global_dict[model_name] = OrderedDict()
    for metric in all_metrics:
        global_dict[model_name][metric] = OrderedDict()

    kvals = [10, 20, 30]

    dataset_initialized = False
    train_data = None
    adj_sets = None

    for K in kvals:

        commons.init_seeds()

        model_class = statics.model_name_map[model_name]
        model_path = os.path.join("bestmodels", dataset_name, str(K),
                                  "{}.pth".format(model_name))
        loaded_file = torch.load(model_path)
        config = loaded_file['config']
        config['data_path'] = os.path.join('dataset', dataset_name)
        config['topk'] = K
        config['valid_metric'] = 'Recall@{}'.format(K)
        config['eval_batch_size'] = 500000
        init_seed(config['seed'], config['reproducibility'])

        init_logger(config)
        logger = logging.getLogger()

        if not dataset_initialized:
            # dataset filtering
            dataset = create_dataset(config)
            train_data, valid_data, test_data = data_preparation(
                config, dataset)
            train_data = add_graph(train_data)
            item_degrees = train_data.graph.in_degrees()[train_data.num_users:]
            adj_sets = construct_sets(train_data)
            dataset_initialized = True

        assert adj_sets
        assert train_data

        model = model_class(config, train_data).to(commons.device)
        trainer = utils.get_trainer(config)(config, model)

        test_result = trainer.evaluate(test_data,
                                       load_best_model=True,
                                       model_file=model_path)
        custom_evaluator = CustomEvaluator(trainer, config, config['metrics'])
        novelty, diversity = custom_evaluator.div_nov(train_data.num_users,
                                                      train_data.num_items,
                                                      item_degrees, test_data,
                                                      adj_sets)
        novelty = round(novelty, 4)
        diversity = round(diversity, 4)
        for metric in all_metrics:
            if metric not in ['novelty', 'diversity']:
                global_dict[model_name][metric][K] = test_result[
                    "{}@{}".format(metric, K)]
        global_dict[model_name]['novelty'][K] = novelty
        global_dict[model_name]['diversity'][K] = diversity