Example #1
    def _set_default_parameters(self):

        self.final_config_dict['dataset'] = self.dataset
        self.final_config_dict['model'] = self.model
        if self.dataset == 'ml-100k':
            current_path = os.path.dirname(os.path.realpath(__file__))
            self.final_config_dict['data_path'] = os.path.join(
                current_path, '../dataset_example/' + self.dataset)
        else:
            self.final_config_dict['data_path'] = os.path.join(
                self.final_config_dict['data_path'], self.dataset)

        if hasattr(get_model(self.model), 'input_type'):
            self.final_config_dict['MODEL_INPUT_TYPE'] = get_model(
                self.model).input_type
        elif 'loss_type' in self.final_config_dict:
            if self.final_config_dict['loss_type'] in ['CE']:
                self.final_config_dict[
                    'MODEL_INPUT_TYPE'] = InputType.POINTWISE
            elif self.final_config_dict['loss_type'] in ['BPR']:
                self.final_config_dict['MODEL_INPUT_TYPE'] = InputType.PAIRWISE
        else:
            raise ValueError('Either Model has attr \'input_type\', '
                             'or arg \'loss_type\' should exist in config.')

        eval_type = None
        for metric in self.final_config_dict['metrics']:
            if metric.lower() in loss_metrics:
                if eval_type is not None and eval_type == EvaluatorType.RANKING:
                    raise RuntimeError(
                        'Ranking metrics and other metrics can not be used at the same time.'
                    )
                else:
                    eval_type = EvaluatorType.INDIVIDUAL
            if metric.lower() in topk_metrics:
                if eval_type is not None and eval_type == EvaluatorType.INDIVIDUAL:
                    raise RuntimeError(
                        'Ranking metrics and other metrics can not be used at the same time.'
                    )
                else:
                    eval_type = EvaluatorType.RANKING
        self.final_config_dict['eval_type'] = eval_type

        smaller_metric = ['rmse', 'mae', 'logloss']
        valid_metric = self.final_config_dict['valid_metric'].split('@')[0]
        self.final_config_dict[
            'valid_metric_bigger'] = False if valid_metric in smaller_metric else True

        if 'additional_feat_suffix' in self.final_config_dict:
            ad_suf = self.final_config_dict['additional_feat_suffix']
            if isinstance(ad_suf, str):
                self.final_config_dict['additional_feat_suffix'] = [ad_suf]
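
The derived defaults end up on the resulting Config object. A minimal usage sketch, assuming a standard RecBole installation with the bundled ml-100k sample (exact values can differ across RecBole versions):

from recbole.config import Config

config = Config(model='BPR', dataset='ml-100k')
print(config['MODEL_INPUT_TYPE'])      # InputType.PAIRWISE, taken from BPR's input_type attribute
print(config['eval_type'])             # EvaluatorType.RANKING for the default top-k metrics
print(config['valid_metric_bigger'])   # True, since the default valid_metric is not rmse/mae/logloss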
Example #2
def objective_function(config_dict=None, config_file_list=None, saved=True):
    r""" The default objective_function used in HyperTuning

    Args:
        config_dict (dict): parameters dictionary used to modify experiment parameters
        config_file_list (list): config files used to modify experiment parameters
        saved (bool): whether to save the model
    """

    config = Config(config_dict=config_dict, config_file_list=config_file_list)
    init_seed(config['seed'], config['reproducibility'])
    logging.basicConfig(level=logging.ERROR)
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    model = get_model(config['model'])(config, train_data).to(config['device'])
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=False, saved=saved)
    test_result = trainer.evaluate(test_data, load_best_model=saved)

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
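
A minimal sketch of how this objective is typically driven by RecBole's HyperTuning; the parameter file and fixed config file names below are placeholders:

from recbole.trainer import HyperTuning

# hyper.test lists the search space; fixed_config.yaml holds parameters shared by all trials.
hp = HyperTuning(objective_function, algo='exhaustive',
                 params_file='hyper.test', fixed_config_file_list=['fixed_config.yaml'])
hp.run()
print('best params: ', hp.best_params)
print('best result: ', hp.params2result[hp.params2str(hp.best_params)])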
Example #3
def save_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)

    # dataset filtering
    dataset = create_dataset(config)
    dataset.save('../saved/')

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    save_split_dataloaders(config,
                           dataloaders=(train_data, valid_data, test_data))

    model = get_model(config['model'])(config, train_data).to(config['device'])

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    # the best model will be saved here
    best_valid_score, best_valid_result = trainer.fit(
        train_data,
        valid_data,
        saved=True,
        show_progress=config['show_progress'])
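
For reference, the calls above write three kinds of artifacts into the checkpoint directory; the names shown in the comments match what the companion load_example further down expects, though exact names depend on dataset, model, and timestamp:

import os

# ../saved/ml-100k-dataset.pth             from dataset.save('../saved/')
# ../saved/ml-100k-for-BPR-dataloader.pth  from save_split_dataloaders(...)
# ../saved/BPR-<timestamp>.pth             best checkpoint written by trainer.fit(..., saved=True)
print(os.listdir('../saved'))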
Example #4
    def _get_model_and_dataset(self, model, dataset):

        if model is None:
            try:
                model = self.external_config_dict['model']
            except KeyError:
                raise KeyError(
                    'model needs to be specified in at least one of these ways: '
                    '[model variable, config file, config dict, command line] '
                )
        if not isinstance(model, str):
            final_model_class = model
            final_model = model.__name__
        else:
            final_model = model
            final_model_class = get_model(final_model)

        if dataset is None:
            try:
                final_dataset = self.external_config_dict['dataset']
            except KeyError:
                raise KeyError(
                    'dataset needs to be specified in at least one of these ways: '
                    '[dataset variable, config file, config dict, command line] '
                )
        else:
            final_dataset = dataset

        return final_model, final_model_class, final_dataset
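
As the two branches above show, the model argument may be either a name string or the model class itself; a short sketch, assuming the bundled BPR model:

from recbole.config import Config
from recbole.model.general_recommender import BPR

# Both calls resolve to the same final model name and model class.
config_by_name = Config(model='BPR', dataset='ml-100k')
config_by_class = Config(model=BPR, dataset='ml-100k')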
Example #5
def load_data_and_model(model_file):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records the parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    dataset = create_dataset(config)
    logger.info(dataset)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    init_seed(config['seed'], config['reproducibility'])
    model = get_model(config['model'])(config,
                                       train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
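
A usage sketch; the checkpoint path is a placeholder for your own saved model file:

config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='saved/BPR-Mar-20-2021_17-11-05.pth')
print(config['model'])
print(dataset)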
Example #6
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    print(train_data.dataset.item_feat)
    print(valid_data.dataset.item_feat)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    with profiler.profile(enabled=config["monitor"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        best_valid_score, best_valid_result = trainer.fit(
            train_data, valid_data, saved=saved, show_progress=config['show_progress']
        )
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    # model evaluation
    with profiler.profile(enabled=config["monitor_eval"], with_stack=True, profile_memory=True, use_cuda=True) as prof:
        test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'], cold_warm_distinct_eval=True)
    if prof is not None:
        print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total'))

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
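
A hypothetical invocation of this profiled variant; monitor and monitor_eval are the custom config keys this version reads, so they are supplied through config_dict:

# Profile training only; both keys are specific to this variant of run_recbole.
run_recbole(model='BPR', dataset='ml-100k',
            config_dict={'monitor': True, 'monitor_eval': False})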
Example #7
def run_recbole(model=None,
                dataset=None,
                config_file_list=None,
                config_dict=None,
                saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str): model name
        dataset (str): dataset name
        config_file_list (list): config files used to modify experiment parameters
        config_dict (dict): parameters dictionary used to modify experiment parameters
        saved (bool): whether to save the model
    """
    # configurations initialization
    config = Config(model=model,
                    dataset=dataset,
                    config_file_list=config_file_list,
                    config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])

    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(train_data,
                                                      valid_data,
                                                      saved=saved)

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved)

    logger.info('best valid result: {}'.format(best_valid_result))
    logger.info('test result: {}'.format(test_result))

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
Example #8
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):
    r""" A fast running api, which includes the complete process of
    training and testing a model on a specified dataset

    Args:
        model (str, optional): Model name. Defaults to ``None``.
        dataset (str, optional): Dataset name. Defaults to ``None``.
        config_file_list (list, optional): Config files used to modify experiment parameters. Defaults to ``None``.
        config_dict (dict, optional): Parameters dictionary used to modify experiment parameters. Defaults to ``None``.
        saved (bool, optional): Whether to save the model. Defaults to ``True``.
    """
    # configurations initialization
    config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    # logger initialization
    init_logger(config)
    logger = getLogger()

    logger.info(config)

    # dataset filtering
    dataset = create_dataset(config)
    if config['save_dataset']:
        dataset.save()
    logger.info(dataset)

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    if config['save_dataloaders']:
        save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data))

    # model loading and initialization
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training
    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, saved=saved, show_progress=config['show_progress']
    )

    # model evaluation
    test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'])

    logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
    logger.info(set_color('test result', 'yellow') + f': {test_result}')

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
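
A minimal quick-start sketch for this entry point; the epoch override is only an illustration:

if __name__ == '__main__':
    result = run_recbole(model='BPR', dataset='ml-100k', config_dict={'epochs': 10})
    print(result['test_result'])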
Example #9
def load_data_and_model(model_file, dataset_file=None, dataloader_file=None):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.
        dataset_file (str, optional): The path of filtered dataset. Defaults to ``None``.
        dataloader_file (str, optional): The path of split dataloaders. Defaults to ``None``.

    Note:
        The :attr:`dataset` will be loaded or created according to the following strategy:
        If :attr:`dataset_file` is not ``None``, the :attr:`dataset` will be loaded from :attr:`dataset_file`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is ``None``,
        the :attr:`dataset` will be created according to :attr:`config`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is not ``None``,
        the :attr:`dataset` will be neither loaded nor created.

        The :attr:`dataloader` will be loaded or created according to the following strategy:
        If :attr:`dataloader_file` is not ``None``, the :attr:`dataloader` will be loaded from :attr:`dataloader_file`.
        If :attr:`dataloader_file` is ``None``, the :attr:`dataloader` will be created according to :attr:`config`.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records the parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_logger(config)

    dataset = None
    if dataset_file:
        with open(dataset_file, 'rb') as f:
            dataset = pickle.load(f)

    if dataloader_file:
        train_data, valid_data, test_data = load_split_dataloaders(dataloader_file)
    else:
        if dataset is None:
            dataset = create_dataset(config)
        train_data, valid_data, test_data = data_preparation(config, dataset)

    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
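
A sketch of the loading strategies described in the Note; every path is a placeholder for artifacts produced by an earlier run with save_dataset and save_dataloaders enabled:

# Fastest path: reuse both the filtered dataset and the split dataloaders.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='saved/BPR-Mar-20-2021_17-11-05.pth',
    dataset_file='saved/ml-100k-dataset.pth',
    dataloader_file='saved/ml-100k-for-BPR-dataloader.pth')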
Example #10
def load_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()

    # You can use your filtered data path here.
    with open('../saved/ml-100k-dataset.pth', 'rb') as f:
        dataset = pickle.load(f)

    train_data, valid_data, test_data = load_split_dataloaders(
        '../saved/ml-100k-for-BPR-dataloader.pth')
    # You can use your split data path here.

    model = get_model(config['model'])(config, train_data).to(config['device'])
    # Here you can replace it with your own model path.
    checkpoint = torch.load('../saved/BPR-Mar-20-2021_17-11-05.pth')
    model.load_state_dict(checkpoint['state_dict'])
    logger.info(model)
    logger.info(train_data.dataset)
    logger.info(valid_data.dataset)
    logger.info(test_data.dataset)
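
A follow-up sketch: once reloaded, the model can be evaluated on the saved test split with a freshly built trainer, without training a new checkpoint:

trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
test_result = trainer.evaluate(test_data, load_best_model=False)
logger.info(test_result)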
Example #11
        test_data = get_dataloader(config, 'test')(config,
                                                   new_test_dataset,
                                                   None,
                                                   shuffle=False)
    else:
        train_data = get_dataloader(config, 'train')(config,
                                                     train_dataset,
                                                     None,
                                                     shuffle=True)
        test_data = get_dataloader(config, 'test')(config,
                                                   test_dataset,
                                                   None,
                                                   shuffle=False)

    # model loading and initialization
    model = get_model(config['model'])(config,
                                       train_data.dataset).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training and evaluation
    test_score, test_result = trainer.fit(
        train_data,
        test_data,
        saved=True,
        show_progress=config['show_progress'])

    logger.info(set_color('test result', 'yellow') + f': {test_result}')
Example #12
from recbole.utils.case_study import full_sort_topk, full_sort_scores


if __name__ == '__main__':
    # this part is to load saved model.
    config_dict = {
        # here you can set some parameters such as `gpu_id` and so on.
    }
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    # Here you can also use `load_split_dataloaders` to load data.
    # The example code for `load_split_dataloaders` can be found in `save_and_load_example.py`.

    model = get_model(config['model'])(config, train_data)
    checkpoint = torch.load('RecBole/saved/BPR-Dec-08-2020_15-37-37.pth')  # Here you can replace it by your model path.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # uid_series = np.array([1, 2])  # internal user id series
    # or you can use dataset.token2id to convert external user tokens to internal user ids
    uid_series = dataset.token2id(dataset.uid_field, ['200'])

    topk_score, topk_iid_list = full_sort_topk(uid_series, model, test_data, k=10)
    print(topk_score)  # scores of top 10 items
    print(topk_iid_list)  # internal id of top 10 items
    external_item_list = dataset.id2token(dataset.iid_field, topk_iid_list)
    print(external_item_list)  # external tokens of top 10 items
    print()
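
full_sort_scores is imported above but not used; a short follow-up sketch, continuing the snippet above, scores every item for the same user and indexes two example external item tokens:

    score = full_sort_scores(uid_series, model, test_data)
    print(score)  # scores of all items
    print(score[0, dataset.token2id(dataset.iid_field, ['242', '302'])])  # scores of two specific items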
Example #13
    def _load_internal_config_dict(self, model, dataset):
        current_path = os.path.dirname(os.path.realpath(__file__))
        overall_init_file = os.path.join(current_path,
                                         '../properties/overall.yaml')
        model_init_file = os.path.join(
            current_path, '../properties/model/' + model + '.yaml')
        sample_init_file = os.path.join(current_path,
                                        '../properties/dataset/sample.yaml')
        dataset_init_file = os.path.join(
            current_path, '../properties/dataset/' + dataset + '.yaml')

        self.internal_config_dict = dict()
        for file in [
                overall_init_file, model_init_file, sample_init_file,
                dataset_init_file
        ]:
            if os.path.isfile(file):
                with open(file, 'r', encoding='utf-8') as f:
                    config_dict = yaml.load(f.read(), Loader=self.yaml_loader)
                    if file == dataset_init_file:
                        self.parameters['Dataset'] += [
                            key for key in config_dict.keys()
                            if key not in self.parameters['Dataset']
                        ]
                    if config_dict is not None:
                        self.internal_config_dict.update(config_dict)

        self.internal_config_dict['MODEL_TYPE'] = get_model(model).type
        if self.internal_config_dict['MODEL_TYPE'] == ModelType.GENERAL:
            pass
        elif self.internal_config_dict['MODEL_TYPE'] == ModelType.CONTEXT:
            self.internal_config_dict.update({
                'eval_setting': 'RO_RS',
                'group_by_user': False,
                'training_neg_sample_num': 0,
                'metrics': ['AUC', 'LogLoss'],
                'valid_metric': 'AUC',
            })
            if dataset == 'ml-100k':
                self.internal_config_dict.update({
                    'threshold': {
                        'rating': 4
                    },
                    'load_col': {
                        'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                        'user': ['user_id', 'age', 'gender', 'occupation'],
                        'item': ['item_id', 'release_year', 'class']
                    },
                })

        elif self.internal_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL:
            if model == 'DIN':
                self.internal_config_dict.update({
                    'eval_setting': 'TO_LS, uni100',
                    'metrics': ['AUC', 'LogLoss'],
                    'valid_metric': 'AUC',
                })
                if dataset == 'ml-100k':
                    self.internal_config_dict.update({
                        'load_col': {
                            'inter':
                            ['user_id', 'item_id', 'rating', 'timestamp'],
                            'user': ['user_id', 'age', 'gender', 'occupation'],
                            'item': ['item_id', 'release_year']
                        },
                    })

            else:
                self.internal_config_dict.update({
                    'eval_setting': 'TO_LS,full',
                })
                if dataset == 'ml-100k' and model in [
                        'GRU4RecF', 'SASRecF', 'FDSA', 'S3Rec'
                ]:
                    self.internal_config_dict.update({
                        'load_col': {
                            'inter':
                            ['user_id', 'item_id', 'rating', 'timestamp'],
                            'item': ['item_id', 'release_year', 'class']
                        },
                    })

        elif self.internal_config_dict['MODEL_TYPE'] == ModelType.KNOWLEDGE:
            self.internal_config_dict.update({
                'load_col': {
                    'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                    'kg': ['head_id', 'relation_id', 'tail_id'],
                    'link': ['item_id', 'entity_id']
                }
            })
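
A standalone sketch of the same layering, assuming the four yaml files exist locally: keys from later files override earlier ones, so dataset-specific settings win over model defaults, which in turn win over overall.yaml:

import os
import yaml

merged = {}
for path in ['overall.yaml', 'model/BPR.yaml', 'dataset/sample.yaml', 'dataset/ml-100k.yaml']:
    if os.path.isfile(path):
        with open(path, 'r', encoding='utf-8') as f:
            merged.update(yaml.safe_load(f) or {})
print(merged)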