Code example #1
    def __init__(self, config, data_tuple, side_information_data, *args, **kwargs):
        self.logger = logging.get_logger(self.__class__.__name__, pylog.CRITICAL if config.config_test else pylog.DEBUG)
        self.config = config
        self.side_information_data = side_information_data
        self.args = args
        self.kwargs = kwargs
        self.train_dict = self.dataframe_to_dict(data_tuple[0])

        self.users = list(self.train_dict.keys())
        self.num_users = len(self.users)
        self.items = list({k for a in self.train_dict.values() for k in a.keys()})
        self.num_items = len(self.items)

        self.features = list({f for i in self.items for f in self.side_information_data.feature_map[i]})
        self.nfeatures = len(self.features)
        self.private_users = {p: u for p, u in enumerate(self.users)}
        self.public_users = {v: k for k, v in self.private_users.items()}
        self.private_items = {p: i for p, i in enumerate(self.items)}
        self.public_items = {v: k for k, v in self.private_items.items()}
        self.private_features = {p: f for p, f in enumerate(self.features)}
        self.public_features = {v: k for k, v in self.private_features.items()}
        self.transactions = sum(len(v) for v in self.train_dict.values())

        self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                                for user, items in self.train_dict.items()}

        self.sp_i_train = self.build_sparse()
        self.sp_i_train_ratings = self.build_sparse_ratings()

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)
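
All of these DataSet constructors build the same two pairs of lookup tables: private_users/private_items map dense integer positions back to the original ids, while public_users/public_items map ids to positions, so the sparse training matrices can be indexed by row and column number. A minimal, self-contained sketch of that remapping (toy data and variable names chosen to mirror the snippet above, not taken from the project):

train_dict = {"u1": {"iA": 5.0, "iB": 3.0}, "u2": {"iB": 4.0}}  # toy user -> {item: rating}

users = list(train_dict.keys())
items = list({i for ratings in train_dict.values() for i in ratings})

# position -> original id ("private") and original id -> position ("public")
private_users = {p: u for p, u in enumerate(users)}
public_users = {u: p for p, u in private_users.items()}
private_items = {p: i for p, i in enumerate(items)}
public_items = {i: p for p, i in private_items.items()}

# re-key the ratings with dense indices, as i_train_dict does above
i_train_dict = {public_users[u]: {public_items[i]: r for i, r in ratings.items()}
                for u, ratings in train_dict.items()}
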
Code example #2
def config_test(builder, base):
    if base.base_namespace.config_test:
        logging_project.init(base.base_namespace.path_logger_config, base.base_namespace.path_log_folder)
        logger = logging_project.get_logger("__main__")
        logger.info("Start config test")
        base.base_namespace.evaluation.relevance_threshold = getattr(base.base_namespace.evaluation, "relevance_threshold", 0)
        res_handler = ResultHandler(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
        hyper_handler = HyperParameterStudy(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
        dataloader_class = getattr(importlib.import_module("elliot.dataset"), base.base_namespace.data_config.dataloader)
        dataloader = dataloader_class(config=base.base_namespace)
        data_test_list = dataloader.generate_dataobjects_mock()
        for key, model_base in builder.models():
            test_results = []
            test_trials = []
            for data_test in data_test_list:
                if key.startswith("external."):
                    spec = importlib.util.spec_from_file_location("external",
                                                                  path.relpath(base.base_namespace.external_models_path))
                    external = importlib.util.module_from_spec(spec)
                    sys.modules[spec.name] = external
                    spec.loader.exec_module(external)
                    model_class = getattr(importlib.import_module("external"), key.split(".", 1)[1])
                else:
                    model_class = getattr(importlib.import_module("elliot.recommender"), key)

                model_base_mock = model_base
                model_base_mock = _reset_verbose_option(model_base_mock)
                model_placeholder = ho.ModelCoordinator(data_test, base.base_namespace, model_base_mock, model_class)
                if isinstance(model_base, tuple):
                    trials = Trials()
                    fmin(model_placeholder.objective,
                                space=model_base_mock[1],
                                algo=model_base_mock[3],
                                trials=trials,
                                rstate=_rstate,
                                max_evals=model_base_mock[2])

                    min_val = np.argmin([i["result"]["loss"] for i in trials._trials])

                    test_results.append(trials._trials[min_val]["result"])
                    test_trials.append(trials)
                else:
                    single = model_placeholder.single()

                    test_results.append(single)

            min_val = np.argmin([i["loss"] for i in test_results])

            res_handler.add_oneshot_recommender(**test_results[min_val])

            if isinstance(model_base, tuple):
                hyper_handler.add_trials(test_trials[min_val])
        logger.info("End config test without issues")
    base.base_namespace.config_test = False
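
config_test (and run_experiment in code example #17) picks the best configuration by taking the argmin of the losses stored in the hyperopt Trials object. A small sketch of that pattern with a made-up one-parameter objective (assumes hyperopt and numpy are installed; a real objective would train and evaluate a recommender):

import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

def objective(params):
    # toy loss: squared distance of x from 0.3
    return {"loss": (params["x"] - 0.3) ** 2, "status": STATUS_OK, "params": params}

trials = Trials()
fmin(objective, space={"x": hp.uniform("x", 0.0, 1.0)},
     algo=tpe.suggest, max_evals=20, trials=trials, verbose=False)

# index of the trial with the smallest loss, mirroring the argmin in the snippets
best = np.argmin([t["result"]["loss"] for t in trials.trials])
print(trials.trials[best]["result"]["params"])
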
Code example #3
    def __init__(self, recommendations, config, params, eval_objects):
        """
        Constructor
        :param recommendations: list of recommendations in the form {user: [(item1,value1),...]}
        :param config: SimpleNameSpace that represents the configuration of the experiment
        :param params: Parameters of the model
        :param eval_objects: list of objects that may be useful for the computation of the different metrics
        """
        super().__init__(recommendations, config, params, eval_objects)
        self.logger = logging.get_logger("Evaluator", pylog.CRITICAL if config.config_test else pylog.DEBUG)
        self._cutoff = self._evaluation_objects.cutoff
        self._relevance = self._evaluation_objects.relevance.binary_relevance
        self._num_items = self._evaluation_objects.num_items
Code example #4
File: evaluator.py Project: sumitsidana/elliot
    def __init__(self, data: ds.DataSet, params: SimpleNamespace):
        """
        Class to manage all the evaluation methods and operations
        :param data: dataset object
        :param k: top-k evaluation
        """
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if data.config.config_test else pylog.DEBUG)
        self._data = data
        self._params = params
        self._k = getattr(data.config.evaluation, "cutoffs",
                          [data.config.top_k])
        self._k = self._k if isinstance(self._k, list) else [self._k]
        if any(np.array(self._k) > data.config.top_k):
            raise Exception(
                "Cutoff values must be smaller than recommendation list length (top_k)"
            )
        self._rel_threshold = data.config.evaluation.relevance_threshold
        self._paired_ttest = self._data.config.evaluation.paired_ttest
        self._metrics = metrics.parse_metrics(
            data.config.evaluation.simple_metrics)
        #TODO
        _validation_metric = getattr(self._params.meta, "validation_metric",
                                     "nDCG@10").split("@")[0]
        if _validation_metric.lower() not in [
                m.lower() for m in data.config.evaluation.simple_metrics
        ]:
            raise Exception(
                "Validation metric must be in list of general metrics")
        self._complex_metrics = getattr(data.config.evaluation,
                                        "complex_metrics", {})
        self._test = data.get_test()

        self._pop = popularity_utils.Popularity(self._data)

        self._evaluation_objects = SimpleNamespace(
            relevance=relevance.Relevance(self._test, self._rel_threshold),
            pop=self._pop,
            num_items=self._data.num_items,
            data=self._data,
            additional_metrics=self._complex_metrics)
        if data.get_validation():
            self._val = data.get_validation()
            self._val_evaluation_objects = SimpleNamespace(
                relevance=relevance.Relevance(self._val, self._rel_threshold),
                pop=self._pop,
                num_items=self._data.num_items,
                data=self._data,
                additional_metrics=self._complex_metrics)
        self._needed_recommendations = self._compute_needed_recommendations()
Code example #5
File: dataset.py Project: sisinflab/KGFlex
    def __init__(self, config, data_tuple, *args, **kwargs):
        """
        Constructor of DataSet
        :param path_train_data: relative path for train file
        :param path_test_data: relative path for test file
        """
        self.logger = logging.get_logger(self.__class__.__name__, pylog.CRITICAL if config.config_test else
                                         pylog.DEBUG)
        self.config = config
        self.args = args
        self.kwargs = kwargs
        self.train_dict = self.dataframe_to_dict(data_tuple[0])

        self.users = list(self.train_dict.keys())
        self.items = list({k for a in self.train_dict.values() for k in a.keys()})
        self.num_users = len(self.users)
        self.num_items = len(self.items)
        self.transactions = sum(len(v) for v in self.train_dict.values())

        self.private_users = {p: u for p, u in enumerate(self.users)}
        self.public_users = {v: k for k, v in self.private_users.items()}
        self.private_items = {p: i for p, i in enumerate(self.items)}
        self.public_items = {v: k for k, v in self.private_items.items()}

        self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                                for user, items in self.train_dict.items()}

        self.sp_i_train = self.build_sparse()
        self.sp_i_train_ratings = self.build_sparse_ratings()

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
            if hasattr(config, "negative_sampling"):
                val_neg_samples, test_neg_samples = NegativeSampler.sample(config, self.public_users, self.public_items, self.sp_i_train, None, self.test_dict)
                sp_i_test = self.to_bool_sparse(self.test_dict)
                test_candidate_items = test_neg_samples + sp_i_test
                self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)
            if hasattr(config, "negative_sampling"):
                val_neg_samples, test_neg_samples = NegativeSampler.sample(config, self.public_users, self.public_items, self.sp_i_train, self.val_dict, self.test_dict)
                sp_i_val = self.to_bool_sparse(self.val_dict)
                sp_i_test = self.to_bool_sparse(self.test_dict)
                val_candidate_items = val_neg_samples + sp_i_val
                self.val_mask = np.where((val_candidate_items.toarray() == True), True, False)
                test_candidate_items = test_neg_samples + sp_i_test
                self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)

        self.allunrated_mask = np.where((self.sp_i_train.toarray() == 0), True, False)
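
When negative_sampling is configured, the loader adds the sampled negative candidates to the true test (or validation) interactions and turns the union into a dense boolean mask, so that only those items are scored at evaluation time. A toy sketch of the mask construction with scipy sparse matrices (the sampler itself is not reproduced here):

import numpy as np
from scipy.sparse import csr_matrix

# 2 users x 4 items: sampled negative candidates and true test interactions (0/1)
test_neg_samples = csr_matrix([[1, 0, 1, 0], [0, 1, 0, 0]])
sp_i_test = csr_matrix([[0, 1, 0, 0], [0, 0, 0, 1]])

# union of negatives and positives; the mask marks every item that should be scored
test_candidate_items = test_neg_samples + sp_i_test
test_mask = test_candidate_items.toarray() > 0
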
Code example #6
    def __init__(self, data: ds.DataSet, params: SimpleNamespace):
        """
        Class to manage all the evaluation methods and operations
        :param data: dataset object
        :param k: top-k evaluation
        """
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if data.config.config_test else pylog.DEBUG)
        self._data = data
        self._params = params
        self._k = getattr(data.config.evaluation, "cutoffs",
                          [data.config.top_k])
        self._k = self._k if isinstance(self._k, list) else [self._k]
        if any(np.array(self._k) > data.config.top_k):
            raise Exception(
                "Cutoff values must be smaller than recommendation list length (top_k)"
            )
        self._rel_threshold = data.config.evaluation.relevance_threshold
        self._paired_ttest = self._data.config.evaluation.paired_ttest
        self._metrics = metrics.parse_metrics(
            data.config.evaluation.simple_metrics)
        self._complex_metrics = getattr(data.config.evaluation,
                                        "complex_metrics", dict())
        #TODO integrate complex metrics in validation metric (the problem is that usually complex metrics generate a complex name that does not match with the base name when looking for the loss value)
        # if _validation_metric.lower() not in [m.lower()
        #                                       for m in data.config.evaluation.simple_metrics]+[m["metric"].lower()
        #                                                                                        for m in self._complex_metrics]:
        #     raise Exception("Validation metric must be in list of general metrics")
        self._test = data.get_test()

        self._pop = popularity_utils.Popularity(self._data)

        self._evaluation_objects = SimpleNamespace(
            relevance=relevance.Relevance(self._test, self._rel_threshold),
            pop=self._pop,
            num_items=self._data.num_items,
            data=self._data,
            additional_metrics=self._complex_metrics)
        if data.get_validation():
            self._val = data.get_validation()
            self._val_evaluation_objects = SimpleNamespace(
                relevance=relevance.Relevance(self._val, self._rel_threshold),
                pop=self._pop,
                num_items=self._data.num_items,
                data=self._data,
                additional_metrics=self._complex_metrics)
        self._needed_recommendations = self._compute_needed_recommendations()
Code example #7
    def __init__(self, data_objs, base: SimpleNamespace, params,
                 model_class: t.ClassVar):
        """
        The constructor creates a Placeholder of the recommender model.

        :param base: a SimpleNamespace that contains the configuration (main level) options
        :param params: a SimpleNamespace that contains the hyper-parameters of the model
        :param model_class: the class of the recommendation model
        """
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if base.config_test else pylog.DEBUG)
        self.data_objs = data_objs
        self.base = base
        self.params = params
        self.model_class = model_class
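
Every snippet on this page obtains its logger through elliot's logging.get_logger helper and silences it (CRITICAL) when config_test is set, so the dry run in code example #2 stays quiet. The helper itself is not shown here; a rough stand-in with the same call shape, built directly on the standard logging module, might look like the sketch below (an illustrative assumption, not the project's actual implementation):

import logging as pylog

def get_logger(name: str, level: int = pylog.DEBUG) -> pylog.Logger:
    # hypothetical stand-in: a named logger with one console handler at the requested level
    logger = pylog.getLogger(name)
    logger.setLevel(level)
    if not logger.handlers:
        handler = pylog.StreamHandler()
        handler.setFormatter(pylog.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    return logger

config_test = False  # mirrors config.config_test in the constructors above
logger = get_logger("ModelCoordinator", pylog.CRITICAL if config_test else pylog.DEBUG)
logger.debug("logger ready")
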
Code example #8
    def __init__(self, config, data_tuple, *args, **kwargs):
        """
        Constructor of DataSet
        :param path_train_data: relative path for train file
        :param path_test_data: relative path for test file
        """
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if config.config_test else pylog.DEBUG)
        self.config = config
        self.args = args
        self.kwargs = kwargs
        self.train_dict = self.dataframe_to_dict(data_tuple[0])

        self.users = list(self.train_dict.keys())
        self.items = list(
            {k
             for a in self.train_dict.values() for k in a.keys()})
        self.num_users = len(self.users)
        self.num_items = len(self.items)
        self.transactions = sum(len(v) for v in self.train_dict.values())

        self.private_users = {p: u for p, u in enumerate(self.users)}
        self.public_users = {v: k for k, v in self.private_users.items()}
        self.private_items = {p: i for p, i in enumerate(self.items)}
        self.public_items = {v: k for k, v in self.private_items.items()}

        self.i_train_dict = {
            self.public_users[user]:
            {self.public_items[i]: v
             for i, v in items.items()}
            for user, items in self.train_dict.items()
        }

        self.sp_i_train = self.build_sparse()
        self.sp_i_train_ratings = self.build_sparse_ratings()

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)
Code example #9
File: visual_dataloader.py Project: g-i-v/elliot
    def read_single_image(images_folder, image_set, size_tuple, image_path):
        image_id = int(image_path.split(".")[0])
        if image_id in image_set:
            try:
                im_pos = Image.open(os.path.join(images_folder, image_path))
                im_pos.load()

                if im_pos.mode != 'RGB':
                    im_pos = im_pos.convert(mode='RGB')

                if size_tuple:
                    im_pos = np.array(
                        im_pos.resize(size_tuple)) / np.float32(255)

                return {image_id: im_pos}
            except (ValueError, PIL.UnidentifiedImageError) as er:
                _logger = logging.get_logger(__class__.__name__)
                _logger.error(
                    f'Image at path {os.path.join(images_folder, image_path)} was not loaded correctly!'
                )
                _logger.error(er)
Code example #10
File: visual_dataloader.py Project: g-i-v/elliot
    def __init__(self, config, data_tuple, side_information_data, *args,
                 **kwargs):
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if config.config_test else pylog.DEBUG)
        self.config = config
        self.side_information_data = side_information_data
        self.args = args
        self.kwargs = kwargs
        self.train_dict = self.dataframe_to_dict(data_tuple[0])

        if self.side_information_data.visual_feature_path:
            self.visual_features = np.load(
                self.side_information_data.visual_feature_path)
            self.item_mapping = pd.read_csv(
                self.side_information_data.item_mapping_path,
                sep="\t",
                header=None)
            self.item_mapping = {
                i: j
                for i, j in zip(self.item_mapping[0], self.item_mapping[1])
            }

        if self.side_information_data.images_src_folder:
            self.output_image_size = literal_eval(
                self.side_information_data.size_tuple
            ) if self.side_information_data.size_tuple else None
            self.item_mapping = pd.read_csv(
                self.side_information_data.item_mapping_path,
                sep="\t",
                header=None)
            self.item_mapping = {
                i: j
                for i, j in zip(self.item_mapping[0], self.item_mapping[1])
            }
            # self.image_dict = self.read_images_multiprocessing(self.side_information_data.images_src_folder, self.side_information_data.aligned_items, self.output_image_size)

        self.users = list(self.train_dict.keys())
        self.num_users = len(self.users)
        self.items = list(self.side_information_data.aligned_items)
        self.num_items = len(self.items)

        self.private_users = {p: u for p, u in enumerate(self.users)}
        self.public_users = {v: k for k, v in self.private_users.items()}
        self.private_items = {p: i for p, i in enumerate(self.items)}
        self.public_items = {v: k for k, v in self.private_items.items()}
        self.transactions = sum(len(v) for v in self.train_dict.values())

        self.i_train_dict = {
            self.public_users[user]:
            {self.public_items[i]: v
             for i, v in items.items()}
            for user, items in self.train_dict.items()
        }

        self.sp_i_train = self.build_sparse()
        self.sp_i_train_ratings = self.build_sparse_ratings()

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)
Code example #11
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet
        :param path_train_data: relative path for train file
        :param path_test_data: relative path for test file
        """

        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            return
        if config.data_config.strategy == "fixed":
            path_train_data = config.data_config.train_path
            path_val_data = getattr(config.data_config, "validation_path",
                                    None)
            path_test_data = config.data_config.test_path

            self.train_dataframe = pd.read_csv(path_train_data,
                                               sep="\t",
                                               header=None,
                                               names=self.column_names)

            self.train_dataframe = self.check_timestamp(self.train_dataframe)

            self.logger.info(f"{path_train_data} - Loaded")

            self.test_dataframe = pd.read_csv(path_test_data,
                                              sep="\t",
                                              header=None,
                                              names=self.column_names)

            self.test_dataframe = self.check_timestamp(self.test_dataframe)

            if path_val_data:
                self.validation_dataframe = pd.read_csv(
                    path_val_data,
                    sep="\t",
                    header=None,
                    names=self.column_names)
                self.validation_dataframe = self.check_timestamp(
                    self.validation_dataframe)

                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif config.data_config.strategy == "hierarchy":
            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

        elif config.data_config.strategy == "dataset":
            self.logger.info("There will be the splitting")
            path_dataset = config.data_config.dataset_path

            self.dataframe = pd.read_csv(path_dataset,
                                         sep="\t",
                                         header=None,
                                         names=self.column_names)

            self.dataframe = self.check_timestamp(self.dataframe)

            self.logger.info(('{0} - Loaded'.format(path_dataset)))

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")
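
The dataloader constructors in code examples #11, #14, and #16 read everything from a nested namespace: config.data_config.strategy selects the branch, and the "fixed" branch expects train_path, test_path, and optionally validation_path (values that run_experiment loads from the YAML configuration file). A minimal namespace that would drive the "fixed" branch could be built by hand like this (paths are placeholders):

from types import SimpleNamespace

config = SimpleNamespace(
    config_test=False,
    data_config=SimpleNamespace(
        strategy="fixed",
        train_path="data/train.tsv",     # placeholder tab-separated files
        test_path="data/test.tsv",
        validation_path="data/val.tsv",  # optional; leave out to skip the validation split
    ),
)
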
Code example #12
File: auc.py Project: kiminh/elliot
    def needs_full_recommendations():
        _logger = logging.get_logger("Evaluator")
        _logger.warning("AUC metric requires full length recommendations")
        return True
Code example #13
    def needs_full_recommendations():
        _logger = logging.get_logger("Evaluator")
        _logger.warning(
            "WARNING: Mean Absolute Error metric requires full length recommendations"
        )
        return True
Code example #14
File: visual_dataloader.py Project: g-i-v/elliot
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet
        :param path_train_data: relative path for train file
        :param path_test_data: relative path for test file
        """

        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            return

        if config.data_config.strategy == "fixed":
            path_train_data = config.data_config.train_path
            path_val_data = getattr(config.data_config, "validation_path",
                                    None)
            path_test_data = config.data_config.test_path
            visual_feature_path = getattr(config.data_config.side_information,
                                          "visual_features", None)
            item_mapping_path = getattr(config.data_config.side_information,
                                        "item_mapping", None)
            size_tuple = getattr(config.data_config.side_information,
                                 "output_image_size", None)

            if visual_feature_path and item_mapping_path:
                feature_set = set(
                    pd.read_csv(item_mapping_path, sep="\t",
                                header=None)[0].unique().tolist())
            else:
                feature_set = {}

            images_src_folder = getattr(config.data_config.side_information,
                                        "images_src_folder", None)

            if images_src_folder:
                image_set = {
                    int(path.split(".")[0])
                    for path in os.listdir(images_src_folder)
                }
            else:
                image_set = {}

            if feature_set and image_set:
                visual_set = feature_set and image_set
            elif feature_set:
                visual_set = feature_set
            elif image_set:
                visual_set = image_set
            else:
                visual_set = {}

            self.side_information_data = SimpleNamespace()

            self.train_dataframe, self.side_information_data.aligned_items = self.load_dataset_dataframe(
                path_train_data, "\t", visual_set)
            self.side_information_data.visual_feature_path = visual_feature_path
            self.side_information_data.item_mapping_path = item_mapping_path
            self.side_information_data.images_src_folder = images_src_folder
            self.side_information_data.size_tuple = size_tuple

            self.train_dataframe = self.check_timestamp(self.train_dataframe)

            self.logger.info('{0} - Loaded'.format(path_train_data))

            self.test_dataframe = pd.read_csv(path_test_data,
                                              sep="\t",
                                              header=None,
                                              names=self.column_names)
            self.test_dataframe = self.check_timestamp(self.test_dataframe)

            if path_val_data:
                self.validation_dataframe = pd.read_csv(
                    path_val_data,
                    sep="\t",
                    header=None,
                    names=self.column_names)
                self.validation_dataframe = self.check_timestamp(
                    self.validation_dataframe)

                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif config.data_config.strategy == "hierarchy":
            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

        elif config.data_config.strategy == "dataset":
            self.logger.info("There will be the splitting")
            path_dataset = config.data_config.dataset_path

            visual_feature_path = getattr(config.data_config.side_information,
                                          "visual_features", None)
            item_mapping_path = getattr(config.data_config.side_information,
                                        "item_mapping", None)
            size_tuple = getattr(config.data_config.side_information,
                                 "output_image_size", None)

            if visual_feature_path and item_mapping_path:
                feature_set = set(
                    pd.read_csv(item_mapping_path, sep="\t",
                                header=None)[0].unique().tolist())
            else:
                feature_set = {}

            images_src_folder = getattr(config.data_config.side_information,
                                        "images_src_folder", None)

            if images_src_folder:
                image_set = {
                    int(path.split(".")[0])
                    for path in os.listdir(images_src_folder)
                }
            else:
                image_set = {}

            if feature_set and image_set:
                visual_set = feature_set and image_set
            elif feature_set:
                visual_set = feature_set
            elif image_set:
                visual_set = image_set
            else:
                visual_set = {}

            self.side_information_data = SimpleNamespace()

            self.dataframe, self.side_information_data.aligned_items = self.load_dataset_dataframe(
                path_dataset, "\t", visual_set)
            self.side_information_data.visual_feature_path = visual_feature_path
            self.side_information_data.item_mapping_path = item_mapping_path
            self.side_information_data.images_src_folder = images_src_folder
            self.side_information_data.size_tuple = size_tuple

            self.dataframe = self.check_timestamp(self.dataframe)

            self.logger.info('{0} - Loaded'.format(path_dataset))

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")
Code example #15
File: kg_flex_chains.py Project: sisinflab/KGFlex
    def __init__(self, config, data_tuple, side_information_data, *args,
                 **kwargs):
        self.logger = logging.get_logger(
            self.__class__.__name__,
            pylog.CRITICAL if config.config_test else pylog.DEBUG)
        self.config = config
        self.side_information_data = side_information_data
        self.args = args
        self.kwargs = kwargs
        self.train_dict = self.dataframe_to_dict(data_tuple[0])
        self.train_pd = data_tuple[0]

        self.users = list(self.train_dict.keys())
        self.num_users = len(self.users)
        self.items = list(
            {k
             for a in self.train_dict.values() for k in a.keys()})
        self.num_items = len(self.items)

        # self.features = list({f for i in self.items for f in self.side_information_data.feature_map[i]})
        # self.factors = len(self.features)
        self.private_users = {p: u for p, u in enumerate(self.users)}
        self.public_users = {v: k for k, v in self.private_users.items()}
        self.private_items = {p: i for p, i in enumerate(self.items)}
        self.public_items = {v: k for k, v in self.private_items.items()}
        # self.private_features = {p: f for p, f in enumerate(self.features)}
        # self.public_features = {v: k for k, v in self.private_features.items()}
        self.transactions = sum(len(v) for v in self.train_dict.values())

        self.i_train_dict = {
            self.public_users[user]:
            {self.public_items[i]: v
             for i, v in items.items()}
            for user, items in self.train_dict.items()
        }

        self.sp_i_train = self.build_sparse()
        self.sp_i_train_ratings = self.build_sparse_ratings()

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)

        # KaHFM compatible features

        kgflex_feature_df = self.side_information_data.feature_map.copy()

        def f(x):
            return str(x["predicate"]) + "><" + str(x["object"])

        kgflex_feature_df["bind"] = kgflex_feature_df.apply(f, axis=1)

        nitems = kgflex_feature_df["itemId"].nunique()
        threshold = 0.93
        kgflex_feature_df = kgflex_feature_df.groupby('bind').filter(
            lambda x: (1 - len(x) / nitems) <= threshold)
        print(
            f"Number of KaHFM features: {kgflex_feature_df['bind'].nunique()} with Threshold: {threshold}"
        )

        feature_index = {
            k: p
            for p, k in enumerate(kgflex_feature_df["bind"].unique())
        }

        kgflex_feature_df["bind2"] = kgflex_feature_df["bind"].map(
            feature_index)
        kgflex_feature_df.drop(columns=["bind"], inplace=True)

        self.side_information_data.kahfm_feature_map = kgflex_feature_df.groupby(
            "itemId")["bind2"].apply(list).to_dict()

        self.features = list(set(feature_index.values()))
        self.private_features = {p: f for p, f in enumerate(self.features)}
        self.public_features = {v: k for k, v in self.private_features.items()}

        if len(data_tuple) == 2:
            self.test_dict = self.build_dict(data_tuple[1], self.users)
            if hasattr(config, "negative_sampling"):
                val_neg_samples, test_neg_samples = NegativeSampler.sample(
                    config, self.public_users, self.public_items,
                    self.sp_i_train, None, self.test_dict)
                sp_i_test = self.to_bool_sparse(self.test_dict)
                test_candidate_items = test_neg_samples + sp_i_test
                self.test_mask = np.where(
                    (test_candidate_items.toarray() == True), True, False)
        else:
            self.val_dict = self.build_dict(data_tuple[1], self.users)
            self.test_dict = self.build_dict(data_tuple[2], self.users)
            if hasattr(config, "negative_sampling"):
                val_neg_samples, test_neg_samples = NegativeSampler.sample(
                    config, self.public_users, self.public_items,
                    self.sp_i_train, self.val_dict, self.test_dict)
                sp_i_val = self.to_bool_sparse(self.val_dict)
                sp_i_test = self.to_bool_sparse(self.test_dict)
                val_candidate_items = val_neg_samples + sp_i_val
                self.val_mask = np.where(
                    (val_candidate_items.toarray() == True), True, False)
                test_candidate_items = test_neg_samples + sp_i_test
                self.test_mask = np.where(
                    (test_candidate_items.toarray() == True), True, False)

        self.allunrated_mask = np.where((self.sp_i_train.toarray() == 0), True,
                                        False)
Code example #16
File: kg_flex_chains.py Project: sisinflab/KGFlex
    def __init__(self, config, *args, **kwargs):
        """
        Constructor of DataSet
        :param path_train_data: relative path for train file
        :param path_test_data: relative path for test file
        """

        self.logger = logging.get_logger(self.__class__.__name__)
        self.args = args
        self.kwargs = kwargs
        self.config = config
        self.column_names = ['userId', 'itemId', 'rating', 'timestamp']
        if config.config_test:
            return

        self.side_information_data = SimpleNamespace()

        if config.data_config.strategy == "fixed":
            path_train_data = config.data_config.train_path
            path_val_data = getattr(config.data_config, "validation_path",
                                    None)
            path_test_data = config.data_config.test_path

            work_directory_path = config.data_config.side_information.work_directory
            map_path = config.data_config.side_information.map
            features_path = config.data_config.side_information.features
            predicates_path = config.data_config.side_information.predicates

            self.train_dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_train_data, predicates_path, features_path)

            self.train_dataframe = self.check_timestamp(self.train_dataframe)

            self.logger.info(f"{path_train_data} - Loaded")

            self.test_dataframe = pd.read_csv(path_test_data,
                                              sep="\t",
                                              header=None,
                                              names=self.column_names)

            self.test_dataframe = self.check_timestamp(self.test_dataframe)

            if config.binarize == True:
                self.test_dataframe["rating"] = 1
                self.train_dataframe["rating"] = 1

            if path_val_data:
                self.validation_dataframe = pd.read_csv(
                    path_val_data,
                    sep="\t",
                    header=None,
                    names=self.column_names)
                self.validation_dataframe = self.check_timestamp(
                    self.validation_dataframe)

                self.tuple_list = [([
                    (self.train_dataframe, self.validation_dataframe)
                ], self.test_dataframe)]
            else:
                self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

        elif config.data_config.strategy == "hierarchy":
            item_mapping_path = getattr(config.data_config.side_information,
                                        "item_mapping", None)
            self.side_information_data.feature_map = self.load_attribute_file(
                item_mapping_path)

            self.tuple_list = self.read_splitting(
                config.data_config.root_folder)

            self.logger.info('{0} - Loaded'.format(
                config.data_config.root_folder))

        elif config.data_config.strategy == "dataset":
            self.logger.info("There will be the splitting")
            path_dataset = config.data_config.dataset_path

            work_directory_path = config.data_config.side_information.work_directory
            map_path = config.data_config.side_information.map
            features_path = config.data_config.side_information.features
            predicates_path = config.data_config.side_information.predicates

            self.dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_dataset, predicates_path, features_path)
            self.dataframe = self.check_timestamp(self.dataframe)

            self.logger.info(('{0} - Loaded'.format(path_dataset)))

            self.dataframe = PreFilter.filter(self.dataframe, self.config)

            if config.binarize == True:
                self.dataframe["rating"] = 1

            splitter = Splitter(self.dataframe, self.config.splitting)
            self.tuple_list = splitter.process_splitting()

        else:
            raise Exception("Strategy option not recognized")
Code example #17
def run_experiment(config_path: str = './config/config.yml'):
    builder = NameSpaceBuilder(config_path, here,
                               path.abspath(path.dirname(config_path)))
    base = builder.base
    config_test(builder, base)
    logging_project.init(base.base_namespace.path_logger_config,
                         base.base_namespace.path_log_folder)
    logger = logging_project.get_logger("__main__")
    logger.info("Start experiment")
    base.base_namespace.evaluation.relevance_threshold = getattr(
        base.base_namespace.evaluation, "relevance_threshold", 0)
    res_handler = ResultHandler(
        rel_threshold=base.base_namespace.evaluation.relevance_threshold)
    hyper_handler = HyperParameterStudy(
        rel_threshold=base.base_namespace.evaluation.relevance_threshold)
    dataloader_class = getattr(importlib.import_module("elliot.dataset"),
                               base.base_namespace.data_config.dataloader)
    dataloader = dataloader_class(config=base.base_namespace)
    data_test_list = dataloader.generate_dataobjects()
    for key, model_base in builder.models():
        test_results = []
        test_trials = []
        for data_test in data_test_list:
            logging_project.prepare_logger(key,
                                           base.base_namespace.path_log_folder)
            if key.startswith("external."):
                spec = importlib.util.spec_from_file_location(
                    "external",
                    path.relpath(base.base_namespace.external_models_path))
                external = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = external
                spec.loader.exec_module(external)
                model_class = getattr(importlib.import_module("external"),
                                      key.split(".", 1)[1])
            else:
                model_class = getattr(
                    importlib.import_module("elliot.recommender"), key)

            model_placeholder = ho.ModelCoordinator(data_test,
                                                    base.base_namespace,
                                                    model_base, model_class)
            if isinstance(model_base, tuple):
                logger.info(f"Tuning begun for {model_class.__name__}\n")
                trials = Trials()
                best = fmin(model_placeholder.objective,
                            space=model_base[1],
                            algo=model_base[3],
                            trials=trials,
                            verbose=False,
                            rstate=_rstate,
                            max_evals=model_base[2])

                # argmin over the trial losses, i.e. the best hyper-parameter combination
                min_val = np.argmin(
                    [i["result"]["loss"] for i in trials._trials])
                ############################################
                best_model_loss = trials._trials[min_val]["result"]["loss"]
                best_model_params = trials._trials[min_val]["result"]["params"]
                best_model_results = trials._trials[min_val]["result"][
                    "test_results"]
                ############################################

                # append to the list of test performances
                test_results.append(trials._trials[min_val]["result"])
                test_trials.append(trials)
                logger.info(f"Tuning ended for {model_class.__name__}")
            else:
                logger.info(f"Training begun for {model_class.__name__}\n")
                single = model_placeholder.single()

                ############################################
                best_model_loss = single["loss"]
                best_model_params = single["params"]
                best_model_results = single["test_results"]
                ############################################

                # append to the list of test performances
                test_results.append(single)
                logger.info(f"Training ended for {model_class.__name__}")

            logger.info(f"Loss:\t{best_model_loss}")
            logger.info(f"Best Model params:\t{best_model_params}")
            logger.info(f"Best Model results:\t{best_model_results}")

        # Best result on the test set, added to the overall performances
        min_val = np.argmin([i["loss"] for i in test_results])

        res_handler.add_oneshot_recommender(**test_results[min_val])

        if isinstance(model_base, tuple):
            hyper_handler.add_trials(test_trials[min_val])

    # res_handler.save_results(output=base.base_namespace.path_output_rec_performance)
    hyper_handler.save_trials(
        output=base.base_namespace.path_output_rec_performance)
    res_handler.save_best_results(
        output=base.base_namespace.path_output_rec_performance)
    first_metric = base.base_namespace.evaluation.simple_metrics[
        0] if base.base_namespace.evaluation.simple_metrics else ""
    res_handler.save_best_models(
        output=base.base_namespace.path_output_rec_performance,
        default_metric=first_metric)
    if hasattr(base.base_namespace, "print_results_as_triplets"
               ) and base.base_namespace.print_results_as_triplets == True:
        res_handler.save_best_results_as_triplets(
            output=base.base_namespace.path_output_rec_performance)
        hyper_handler.save_trials_as_triplets(
            output=base.base_namespace.path_output_rec_performance)
    if hasattr(base.base_namespace.evaluation,
               "paired_ttest") and base.base_namespace.evaluation.paired_ttest:
        res_handler.save_best_statistical_results(
            stat_test=StatTest.PairedTTest,
            output=base.base_namespace.path_output_rec_performance)
    if hasattr(
            base.base_namespace.evaluation,
            "wilcoxon_test") and base.base_namespace.evaluation.wilcoxon_test:
        res_handler.save_best_statistical_results(
            stat_test=StatTest.WilcoxonTest,
            output=base.base_namespace.path_output_rec_performance)

    logger.info("End experiment")