def __init__(self, config, data_tuple, side_information_data, *args, **kwargs):
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self.config = config
    self.side_information_data = side_information_data
    self.args = args
    self.kwargs = kwargs
    self.train_dict = self.dataframe_to_dict(data_tuple[0])

    self.users = list(self.train_dict.keys())
    self.num_users = len(self.users)
    self.items = list({k for a in self.train_dict.values() for k in a.keys()})
    self.num_items = len(self.items)
    self.features = list({f for i in self.items for f in self.side_information_data.feature_map[i]})
    self.nfeatures = len(self.features)

    self.private_users = {p: u for p, u in enumerate(self.users)}
    self.public_users = {v: k for k, v in self.private_users.items()}
    self.private_items = {p: i for p, i in enumerate(self.items)}
    self.public_items = {v: k for k, v in self.private_items.items()}
    self.private_features = {p: f for p, f in enumerate(self.features)}
    self.public_features = {v: k for k, v in self.private_features.items()}
    self.transactions = sum(len(v) for v in self.train_dict.values())

    self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                         for user, items in self.train_dict.items()}

    self.sp_i_train = self.build_sparse()
    self.sp_i_train_ratings = self.build_sparse_ratings()

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)
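# The constructor above relies on helpers such as dataframe_to_dict and build_sparse that
# are defined elsewhere in the class. A minimal standalone sketch of what they might compute,
# with assumed signatures (these are not the framework's actual methods):
import numpy as np
import pandas as pd
import scipy.sparse as sp

def dataframe_to_dict(data: pd.DataFrame) -> dict:
    """Group a (userId, itemId, rating) dataframe into {user: {item: rating}}."""
    ratings = {}
    for user, items in data.groupby("userId"):
        ratings[user] = dict(zip(items["itemId"], items["rating"]))
    return ratings

def build_sparse(i_train_dict: dict, num_users: int, num_items: int) -> sp.csr_matrix:
    """Binary user-item interaction matrix built from the re-indexed train dictionary."""
    rows, cols = [], []
    for u, items in i_train_dict.items():
        rows.extend([u] * len(items))
        cols.extend(items.keys())
    data = np.ones(len(rows), dtype="float32")
    return sp.csr_matrix((data, (rows, cols)), shape=(num_users, num_items))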
def config_test(builder, base):
    if base.base_namespace.config_test:
        logging_project.init(base.base_namespace.path_logger_config,
                             base.base_namespace.path_log_folder)
        logger = logging_project.get_logger("__main__")
        logger.info("Start config test")
        base.base_namespace.evaluation.relevance_threshold = getattr(
            base.base_namespace.evaluation, "relevance_threshold", 0)
        res_handler = ResultHandler(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
        hyper_handler = HyperParameterStudy(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
        dataloader_class = getattr(importlib.import_module("elliot.dataset"),
                                   base.base_namespace.data_config.dataloader)
        dataloader = dataloader_class(config=base.base_namespace)
        data_test_list = dataloader.generate_dataobjects_mock()

        for key, model_base in builder.models():
            test_results = []
            test_trials = []
            for data_test in data_test_list:
                if key.startswith("external."):
                    spec = importlib.util.spec_from_file_location(
                        "external", path.relpath(base.base_namespace.external_models_path))
                    external = importlib.util.module_from_spec(spec)
                    sys.modules[spec.name] = external
                    spec.loader.exec_module(external)
                    model_class = getattr(importlib.import_module("external"), key.split(".", 1)[1])
                else:
                    model_class = getattr(importlib.import_module("elliot.recommender"), key)

                model_base_mock = model_base
                model_base_mock = _reset_verbose_option(model_base_mock)
                model_placeholder = ho.ModelCoordinator(data_test, base.base_namespace,
                                                        model_base_mock, model_class)
                if isinstance(model_base, tuple):
                    trials = Trials()
                    fmin(model_placeholder.objective,
                         space=model_base_mock[1],
                         algo=model_base_mock[3],
                         trials=trials,
                         rstate=_rstate,
                         max_evals=model_base_mock[2])
                    min_val = np.argmin([i["result"]["loss"] for i in trials._trials])
                    test_results.append(trials._trials[min_val]["result"])
                    test_trials.append(trials)
                else:
                    single = model_placeholder.single()
                    test_results.append(single)

            min_val = np.argmin([i["loss"] for i in test_results])
            res_handler.add_oneshot_recommender(**test_results[min_val])
            if isinstance(model_base, tuple):
                hyper_handler.add_trials(test_trials[min_val])

        logger.info("End config test without issues")
    base.base_namespace.config_test = False
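# Both config_test and run_experiment drive hyper-parameter tuning through hyperopt's
# fmin/Trials loop and then pick the trial with the smallest loss. A minimal, self-contained
# sketch of that pattern with a toy objective (the search space and names are illustrative only):
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

space = {"lr": hp.loguniform("lr", np.log(1e-4), np.log(1e-1))}

def objective(params):
    # A real objective would train the model and return its validation loss.
    loss = (np.log10(params["lr"]) + 2.0) ** 2
    return {"loss": loss, "status": STATUS_OK}

trials = Trials()
best = fmin(objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
best_idx = np.argmin([t["result"]["loss"] for t in trials.trials])
print(best, trials.trials[best_idx]["result"])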
def __init__(self, recommendations, config, params, eval_objects):
    """
    Constructor
    :param recommendations: list of recommendations in the form {user: [(item1, value1), ...]}
    :param config: SimpleNamespace that represents the configuration of the experiment
    :param params: parameters of the model
    :param eval_objects: list of objects that may be useful for the computation of the different metrics
    """
    super().__init__(recommendations, config, params, eval_objects)
    self.logger = logging.get_logger("Evaluator",
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self._cutoff = self._evaluation_objects.cutoff
    self._relevance = self._evaluation_objects.relevance.binary_relevance
    self._num_items = self._evaluation_objects.num_items
def __init__(self, data: ds.DataSet, params: SimpleNamespace):
    """
    Class to manage all the evaluation methods and operations
    :param data: dataset object
    :param params: model parameters (the validation metric is read from params.meta)
    """
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if data.config.config_test else pylog.DEBUG)
    self._data = data
    self._params = params
    self._k = getattr(data.config.evaluation, "cutoffs", [data.config.top_k])
    self._k = self._k if isinstance(self._k, list) else [self._k]
    if any(np.array(self._k) > data.config.top_k):
        raise Exception("Cutoff values must be smaller than recommendation list length (top_k)")
    self._rel_threshold = data.config.evaluation.relevance_threshold
    self._paired_ttest = self._data.config.evaluation.paired_ttest
    self._metrics = metrics.parse_metrics(data.config.evaluation.simple_metrics)
    # TODO: integrate complex metrics in the validation-metric check
    _validation_metric = getattr(self._params.meta, "validation_metric", "nDCG@10").split("@")[0]
    if _validation_metric.lower() not in [m.lower() for m in data.config.evaluation.simple_metrics]:
        raise Exception("Validation metric must be in the list of simple metrics")
    self._complex_metrics = getattr(data.config.evaluation, "complex_metrics", {})
    self._test = data.get_test()
    self._pop = popularity_utils.Popularity(self._data)
    self._evaluation_objects = SimpleNamespace(
        relevance=relevance.Relevance(self._test, self._rel_threshold),
        pop=self._pop,
        num_items=self._data.num_items,
        data=self._data,
        additional_metrics=self._complex_metrics)
    if data.get_validation():
        self._val = data.get_validation()
        self._val_evaluation_objects = SimpleNamespace(
            relevance=relevance.Relevance(self._val, self._rel_threshold),
            pop=self._pop,
            num_items=self._data.num_items,
            data=self._data,
            additional_metrics=self._complex_metrics)
    self._needed_recommendations = self._compute_needed_recommendations()
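# The constructor above extracts the metric name from strings such as "nDCG@10" via
# split("@")[0]. A small sketch of that parsing step, as a standalone helper (the function
# name is illustrative, not part of the framework):
def parse_validation_metric(metric_string: str = "nDCG@10"):
    """Split 'name@cutoff' into (name, cutoff); the '@k' suffix is treated as optional."""
    if "@" in metric_string:
        name, k = metric_string.split("@", 1)
        return name, int(k)
    return metric_string, None

print(parse_validation_metric("nDCG@10"))   # ('nDCG', 10)
print(parse_validation_metric("Recall"))    # ('Recall', None)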
def __init__(self, config, data_tuple, *args, **kwargs):
    """
    Constructor of DataSet
    :param config: configuration namespace of the experiment
    :param data_tuple: (train, test) or (train, validation, test) dataframes
    """
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self.config = config
    self.args = args
    self.kwargs = kwargs
    self.train_dict = self.dataframe_to_dict(data_tuple[0])

    self.users = list(self.train_dict.keys())
    self.items = list({k for a in self.train_dict.values() for k in a.keys()})
    self.num_users = len(self.users)
    self.num_items = len(self.items)
    self.transactions = sum(len(v) for v in self.train_dict.values())

    self.private_users = {p: u for p, u in enumerate(self.users)}
    self.public_users = {v: k for k, v in self.private_users.items()}
    self.private_items = {p: i for p, i in enumerate(self.items)}
    self.public_items = {v: k for k, v in self.private_items.items()}

    self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                         for user, items in self.train_dict.items()}

    self.sp_i_train = self.build_sparse()
    self.sp_i_train_ratings = self.build_sparse_ratings()

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
        if hasattr(config, "negative_sampling"):
            val_neg_samples, test_neg_samples = NegativeSampler.sample(
                config, self.public_users, self.public_items, self.sp_i_train, None, self.test_dict)
            sp_i_test = self.to_bool_sparse(self.test_dict)
            test_candidate_items = test_neg_samples + sp_i_test
            self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)
        if hasattr(config, "negative_sampling"):
            val_neg_samples, test_neg_samples = NegativeSampler.sample(
                config, self.public_users, self.public_items, self.sp_i_train, self.val_dict, self.test_dict)
            sp_i_val = self.to_bool_sparse(self.val_dict)
            sp_i_test = self.to_bool_sparse(self.test_dict)
            val_candidate_items = val_neg_samples + sp_i_val
            self.val_mask = np.where((val_candidate_items.toarray() == True), True, False)
            test_candidate_items = test_neg_samples + sp_i_test
            self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)

    self.allunrated_mask = np.where((self.sp_i_train.toarray() == 0), True, False)
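# A toy illustration of how the evaluation masks above are assembled: the test mask combines
# sampled negatives with the held-out positives, while the all-unrated mask covers everything
# the user did not interact with in training. The matrices below are made-up 3x4 examples.
import numpy as np
import scipy.sparse as sp

sp_i_train = sp.csr_matrix(np.array([[1, 0, 0, 1],
                                     [0, 1, 0, 0],
                                     [0, 0, 1, 0]], dtype=np.int8))
sp_i_test = sp.csr_matrix(np.array([[0, 1, 0, 0],
                                    [0, 0, 1, 0],
                                    [1, 0, 0, 0]], dtype=np.int8))
test_neg_samples = sp.csr_matrix(np.array([[0, 0, 1, 0],
                                           [1, 0, 0, 1],
                                           [0, 1, 0, 1]], dtype=np.int8))

# Candidate items for ranking: sampled negatives plus the held-out positives.
test_mask = (test_neg_samples + sp_i_test).toarray() > 0
# Alternative protocol: rank every item the user did not train on.
allunrated_mask = sp_i_train.toarray() == 0
print(test_mask)
print(allunrated_mask)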
def __init__(self, data: ds.DataSet, params: SimpleNamespace):
    """
    Class to manage all the evaluation methods and operations
    :param data: dataset object
    :param params: model parameters
    """
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if data.config.config_test else pylog.DEBUG)
    self._data = data
    self._params = params
    self._k = getattr(data.config.evaluation, "cutoffs", [data.config.top_k])
    self._k = self._k if isinstance(self._k, list) else [self._k]
    if any(np.array(self._k) > data.config.top_k):
        raise Exception("Cutoff values must be smaller than recommendation list length (top_k)")
    self._rel_threshold = data.config.evaluation.relevance_threshold
    self._paired_ttest = self._data.config.evaluation.paired_ttest
    self._metrics = metrics.parse_metrics(data.config.evaluation.simple_metrics)
    self._complex_metrics = getattr(data.config.evaluation, "complex_metrics", dict())
    # TODO: integrate complex metrics in the validation metric (the problem is that complex metrics
    # usually generate a composite name that does not match the base name when looking up the loss value)
    # if _validation_metric.lower() not in [m.lower()
    #                                       for m in data.config.evaluation.simple_metrics] \
    #         + [m["metric"].lower() for m in self._complex_metrics]:
    #     raise Exception("Validation metric must be in list of general metrics")
    self._test = data.get_test()
    self._pop = popularity_utils.Popularity(self._data)
    self._evaluation_objects = SimpleNamespace(
        relevance=relevance.Relevance(self._test, self._rel_threshold),
        pop=self._pop,
        num_items=self._data.num_items,
        data=self._data,
        additional_metrics=self._complex_metrics)
    if data.get_validation():
        self._val = data.get_validation()
        self._val_evaluation_objects = SimpleNamespace(
            relevance=relevance.Relevance(self._val, self._rel_threshold),
            pop=self._pop,
            num_items=self._data.num_items,
            data=self._data,
            additional_metrics=self._complex_metrics)
    self._needed_recommendations = self._compute_needed_recommendations()
def __init__(self, data_objs, base: SimpleNamespace, params, model_class: t.ClassVar):
    """
    The constructor creates a placeholder of the recommender model.

    :param data_objs: data object(s) for the current split
    :param base: a SimpleNamespace that contains the configuration (main-level) options
    :param params: a SimpleNamespace that contains the hyper-parameters of the model
    :param model_class: the class of the recommendation model
    """
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if base.config_test else pylog.DEBUG)
    self.data_objs = data_objs
    self.base = base
    self.params = params
    self.model_class = model_class
def __init__(self, config, data_tuple, *args, **kwargs):
    """
    Constructor of DataSet
    :param config: configuration namespace of the experiment
    :param data_tuple: (train, test) or (train, validation, test) dataframes
    """
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self.config = config
    self.args = args
    self.kwargs = kwargs
    self.train_dict = self.dataframe_to_dict(data_tuple[0])

    self.users = list(self.train_dict.keys())
    self.items = list({k for a in self.train_dict.values() for k in a.keys()})
    self.num_users = len(self.users)
    self.num_items = len(self.items)
    self.transactions = sum(len(v) for v in self.train_dict.values())

    self.private_users = {p: u for p, u in enumerate(self.users)}
    self.public_users = {v: k for k, v in self.private_users.items()}
    self.private_items = {p: i for p, i in enumerate(self.items)}
    self.public_items = {v: k for k, v in self.private_items.items()}

    self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                         for user, items in self.train_dict.items()}

    self.sp_i_train = self.build_sparse()
    self.sp_i_train_ratings = self.build_sparse_ratings()

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)
def read_single_image(images_folder, image_set, size_tuple, image_path):
    image_id = int(image_path.split(".")[0])
    if image_id in image_set:
        try:
            im_pos = Image.open(os.path.join(images_folder, image_path))
            im_pos.load()
            if im_pos.mode != 'RGB':
                im_pos = im_pos.convert(mode='RGB')
            if size_tuple:
                im_pos = np.array(im_pos.resize(size_tuple)) / np.float32(255)
            return {image_id: im_pos}
        except (ValueError, PIL.UnidentifiedImageError) as er:
            # __class__ is undefined in a plain function, so log under the module name instead.
            _logger = logging.get_logger(__name__)
            _logger.error(f'Image at path {os.path.join(images_folder, image_path)} was not loaded correctly!')
            _logger.error(er)
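# A hypothetical wrapper showing how read_single_image could be mapped over an image folder
# with a process pool (the visual data set below references a read_images_multiprocessing
# method in a comment; this sketch only mirrors that idea and is not the framework's code):
import os
from functools import partial
from multiprocessing import Pool

def read_images(images_folder, image_set, size_tuple=(224, 224)):
    # Bind the shared arguments, then let the pool supply one file name per call.
    reader = partial(read_single_image, images_folder, image_set, size_tuple)
    with Pool() as pool:
        results = pool.map(reader, os.listdir(images_folder))
    image_dict = {}
    for res in results:
        if res:  # read_single_image returns None for files outside image_set or on errors
            image_dict.update(res)
    return image_dict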
def __init__(self, config, data_tuple, side_information_data, *args, **kwargs):
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self.config = config
    self.side_information_data = side_information_data
    self.args = args
    self.kwargs = kwargs
    self.train_dict = self.dataframe_to_dict(data_tuple[0])

    if self.side_information_data.visual_feature_path:
        self.visual_features = np.load(self.side_information_data.visual_feature_path)
        self.item_mapping = pd.read_csv(self.side_information_data.item_mapping_path,
                                        sep="\t", header=None)
        self.item_mapping = {i: j for i, j in zip(self.item_mapping[0], self.item_mapping[1])}

    if self.side_information_data.images_src_folder:
        self.output_image_size = literal_eval(self.side_information_data.size_tuple) \
            if self.side_information_data.size_tuple else None
        self.item_mapping = pd.read_csv(self.side_information_data.item_mapping_path,
                                        sep="\t", header=None)
        self.item_mapping = {i: j for i, j in zip(self.item_mapping[0], self.item_mapping[1])}
        # self.image_dict = self.read_images_multiprocessing(self.side_information_data.images_src_folder,
        #                                                    self.side_information_data.aligned_items,
        #                                                    self.output_image_size)

    self.users = list(self.train_dict.keys())
    self.num_users = len(self.users)
    self.items = list(self.side_information_data.aligned_items)
    self.num_items = len(self.items)

    self.private_users = {p: u for p, u in enumerate(self.users)}
    self.public_users = {v: k for k, v in self.private_users.items()}
    self.private_items = {p: i for p, i in enumerate(self.items)}
    self.public_items = {v: k for k, v in self.private_items.items()}
    self.transactions = sum(len(v) for v in self.train_dict.values())

    self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                         for user, items in self.train_dict.items()}

    self.sp_i_train = self.build_sparse()
    self.sp_i_train_ratings = self.build_sparse_ratings()

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)
def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet
    :param config: configuration namespace that carries the data paths and the loading strategy
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path

        self.train_dataframe = pd.read_csv(path_train_data, sep="\t", header=None, names=self.column_names)
        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t", header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t", header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)
            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)], self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]
    elif config.data_config.strategy == "hierarchy":
        self.tuple_list = self.read_splitting(config.data_config.root_folder)
    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split at runtime")
        path_dataset = config.data_config.dataset_path
        self.dataframe = pd.read_csv(path_dataset, sep="\t", header=None, names=self.column_names)
        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")
        self.dataframe = PreFilter.filter(self.dataframe, self.config)
        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()
    else:
        raise Exception("Strategy option not recognized")
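# check_timestamp is called on every loaded dataframe but is defined elsewhere in the class.
# A minimal sketch of one plausible behaviour, assuming an empty timestamp column should
# simply be dropped (this is an assumption, not the framework's actual method):
import pandas as pd

def check_timestamp(df: pd.DataFrame) -> pd.DataFrame:
    if all(df["timestamp"].isna()):
        df = df.drop(columns=["timestamp"]).reset_index(drop=True)
    return df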
def needs_full_recommendations():
    _logger = logging.get_logger("Evaluator")
    _logger.warning("AUC metric requires full-length recommendations")
    return True
def needs_full_recommendations():
    _logger = logging.get_logger("Evaluator")
    _logger.warning("Mean Absolute Error metric requires full-length recommendations")
    return True
def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet
    :param config: configuration namespace that carries the data paths, the visual side information, and the loading strategy
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path

        visual_feature_path = getattr(config.data_config.side_information, "visual_features", None)
        item_mapping_path = getattr(config.data_config.side_information, "item_mapping", None)
        size_tuple = getattr(config.data_config.side_information, "output_image_size", None)

        if visual_feature_path and item_mapping_path:
            feature_set = set(pd.read_csv(item_mapping_path, sep="\t", header=None)[0].unique().tolist())
        else:
            feature_set = {}

        images_src_folder = getattr(config.data_config.side_information, "images_src_folder", None)
        if images_src_folder:
            image_set = {int(path.split(".")[0]) for path in os.listdir(images_src_folder)}
        else:
            image_set = {}

        if feature_set and image_set:
            # Items must be aligned on both sources: take the set intersection
            # ('and' would simply return the second operand).
            visual_set = feature_set & image_set
        elif feature_set:
            visual_set = feature_set
        elif image_set:
            visual_set = image_set
        else:
            visual_set = {}

        self.side_information_data = SimpleNamespace()
        self.train_dataframe, self.side_information_data.aligned_items = \
            self.load_dataset_dataframe(path_train_data, "\t", visual_set)
        self.side_information_data.visual_feature_path = visual_feature_path
        self.side_information_data.item_mapping_path = item_mapping_path
        self.side_information_data.images_src_folder = images_src_folder
        self.side_information_data.size_tuple = size_tuple

        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t", header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t", header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)
            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)], self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]
    elif config.data_config.strategy == "hierarchy":
        self.tuple_list = self.read_splitting(config.data_config.root_folder)
    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split at runtime")
        path_dataset = config.data_config.dataset_path

        visual_feature_path = getattr(config.data_config.side_information, "visual_features", None)
        item_mapping_path = getattr(config.data_config.side_information, "item_mapping", None)
        size_tuple = getattr(config.data_config.side_information, "output_image_size", None)

        if visual_feature_path and item_mapping_path:
            feature_set = set(pd.read_csv(item_mapping_path, sep="\t", header=None)[0].unique().tolist())
        else:
            feature_set = {}

        images_src_folder = getattr(config.data_config.side_information, "images_src_folder", None)
        if images_src_folder:
            image_set = {int(path.split(".")[0]) for path in os.listdir(images_src_folder)}
        else:
            image_set = {}

        if feature_set and image_set:
            # Same intersection fix as in the "fixed" branch above.
            visual_set = feature_set & image_set
        elif feature_set:
            visual_set = feature_set
        elif image_set:
            visual_set = image_set
        else:
            visual_set = {}

        self.side_information_data = SimpleNamespace()
        self.dataframe, self.side_information_data.aligned_items = \
            self.load_dataset_dataframe(path_dataset, "\t", visual_set)
        self.side_information_data.visual_feature_path = visual_feature_path
        self.side_information_data.item_mapping_path = item_mapping_path
        self.side_information_data.images_src_folder = images_src_folder
        self.side_information_data.size_tuple = size_tuple

        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")
        self.dataframe = PreFilter.filter(self.dataframe, self.config)

        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()
    else:
        raise Exception("Strategy option not recognized")
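# Why the intersection fix above matters: 'and' between two non-empty sets returns the second
# operand, not their common elements. A two-line demonstration with made-up item ids:
feature_set = {1, 2, 3}
image_set = {2, 3, 4}
print(feature_set and image_set)  # {2, 3, 4}: 'and' returns its second operand
print(feature_set & image_set)    # {2, 3}: the actual set of aligned items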
def __init__(self, config, data_tuple, side_information_data, *args, **kwargs):
    self.logger = logging.get_logger(self.__class__.__name__,
                                     pylog.CRITICAL if config.config_test else pylog.DEBUG)
    self.config = config
    self.side_information_data = side_information_data
    self.args = args
    self.kwargs = kwargs
    self.train_dict = self.dataframe_to_dict(data_tuple[0])
    self.train_pd = data_tuple[0]

    self.users = list(self.train_dict.keys())
    self.num_users = len(self.users)
    self.items = list({k for a in self.train_dict.values() for k in a.keys()})
    self.num_items = len(self.items)
    # self.features = list({f for i in self.items for f in self.side_information_data.feature_map[i]})
    # self.factors = len(self.features)

    self.private_users = {p: u for p, u in enumerate(self.users)}
    self.public_users = {v: k for k, v in self.private_users.items()}
    self.private_items = {p: i for p, i in enumerate(self.items)}
    self.public_items = {v: k for k, v in self.private_items.items()}
    # self.private_features = {p: f for p, f in enumerate(self.features)}
    # self.public_features = {v: k for k, v in self.private_features.items()}
    self.transactions = sum(len(v) for v in self.train_dict.values())

    self.i_train_dict = {self.public_users[user]: {self.public_items[i]: v for i, v in items.items()}
                         for user, items in self.train_dict.items()}

    self.sp_i_train = self.build_sparse()
    self.sp_i_train_ratings = self.build_sparse_ratings()

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)

    # KaHFM-compatible features
    kgflex_feature_df = self.side_information_data.feature_map.copy()

    def f(x):
        return str(x["predicate"]) + "><" + str(x["object"])

    kgflex_feature_df["bind"] = kgflex_feature_df.apply(f, axis=1)
    nitems = kgflex_feature_df["itemId"].nunique()
    threshold = 0.93
    kgflex_feature_df = kgflex_feature_df.groupby('bind').filter(
        lambda x: (1 - len(x) / nitems) <= threshold)
    print(f"Number of KaHFM features: {kgflex_feature_df['bind'].nunique()} with Threshold: {threshold}")

    feature_index = {k: p for p, k in enumerate(kgflex_feature_df["bind"].unique())}
    kgflex_feature_df["bind2"] = kgflex_feature_df["bind"].map(feature_index)
    kgflex_feature_df.drop(columns=["bind"], inplace=True)
    self.side_information_data.kahfm_feature_map = \
        kgflex_feature_df.groupby("itemId")["bind2"].apply(list).to_dict()

    self.features = list(set(feature_index.values()))
    self.private_features = {p: f for p, f in enumerate(self.features)}
    self.public_features = {v: k for k, v in self.private_features.items()}

    if len(data_tuple) == 2:
        self.test_dict = self.build_dict(data_tuple[1], self.users)
        if hasattr(config, "negative_sampling"):
            val_neg_samples, test_neg_samples = NegativeSampler.sample(
                config, self.public_users, self.public_items, self.sp_i_train, None, self.test_dict)
            sp_i_test = self.to_bool_sparse(self.test_dict)
            test_candidate_items = test_neg_samples + sp_i_test
            self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)
    else:
        self.val_dict = self.build_dict(data_tuple[1], self.users)
        self.test_dict = self.build_dict(data_tuple[2], self.users)
        if hasattr(config, "negative_sampling"):
            val_neg_samples, test_neg_samples = NegativeSampler.sample(
                config, self.public_users, self.public_items, self.sp_i_train, self.val_dict, self.test_dict)
            sp_i_val = self.to_bool_sparse(self.val_dict)
            sp_i_test = self.to_bool_sparse(self.test_dict)
            val_candidate_items = val_neg_samples + sp_i_val
            self.val_mask = np.where((val_candidate_items.toarray() == True), True, False)
            test_candidate_items = test_neg_samples + sp_i_test
            self.test_mask = np.where((test_candidate_items.toarray() == True), True, False)

    self.allunrated_mask = np.where((self.sp_i_train.toarray() == 0), True, False)
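# A toy version of the KaHFM feature pruning performed above: predicate><object pairs become
# "bind" features, and a feature is kept only if the share of items missing it stays within the
# threshold (i.e. very rare features are discarded). Data and threshold here are illustrative;
# the constructor above uses threshold = 0.93.
import pandas as pd

kg_df = pd.DataFrame({"itemId":    [1, 2, 3, 1, 2],
                      "predicate": ["genre", "genre", "genre", "director", "director"],
                      "object":    ["action", "action", "action", "x", "y"]})
kg_df["bind"] = kg_df["predicate"].astype(str) + "><" + kg_df["object"].astype(str)
nitems = kg_df["itemId"].nunique()
threshold = 0.5
kg_df = kg_df.groupby("bind").filter(lambda x: (1 - len(x) / nitems) <= threshold)
print(kg_df["bind"].unique())  # ['genre><action']: the single-item director features are dropped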
def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet
    :param config: configuration namespace that carries the data paths, the knowledge-graph side information, and the loading strategy
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    self.side_information_data = SimpleNamespace()

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path
        work_directory_path = config.data_config.side_information.work_directory
        map_path = config.data_config.side_information.map
        features_path = config.data_config.side_information.features
        predicates_path = config.data_config.side_information.predicates

        self.train_dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = \
            self.load_dataset_dataframe(path_train_data, predicates_path, features_path)
        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t", header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if config.binarize == True:
            self.test_dataframe["rating"] = 1
            self.train_dataframe["rating"] = 1

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t", header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)
            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)], self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]
    elif config.data_config.strategy == "hierarchy":
        item_mapping_path = getattr(config.data_config.side_information, "item_mapping", None)
        self.side_information_data.feature_map = self.load_attribute_file(item_mapping_path)
        self.tuple_list = self.read_splitting(config.data_config.root_folder)
        self.logger.info(f"{config.data_config.root_folder} - Loaded")
    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split at runtime")
        path_dataset = config.data_config.dataset_path
        work_directory_path = config.data_config.side_information.work_directory
        map_path = config.data_config.side_information.map
        features_path = config.data_config.side_information.features
        predicates_path = config.data_config.side_information.predicates

        self.dataframe, self.side_information_data.feature_map, self.side_information_data.predicate_mapping = \
            self.load_dataset_dataframe(path_dataset, predicates_path, features_path)
        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")
        self.dataframe = PreFilter.filter(self.dataframe, self.config)

        if config.binarize == True:
            self.dataframe["rating"] = 1

        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()
    else:
        raise Exception("Strategy option not recognized")
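# load_attribute_file, used in the "hierarchy" branch above, is defined elsewhere. A minimal
# sketch of one plausible implementation, assuming a tab-separated "itemId f1 f2 ..." file
# format (the format is an assumption, not confirmed by the snippet above):
def load_attribute_file(attribute_file, separator="\t"):
    feature_map = {}
    with open(attribute_file) as file:
        for line in file:
            fields = line.strip().split(separator)
            if fields and fields[0]:
                feature_map[int(fields[0])] = [int(f) for f in fields[1:] if f]
    return feature_map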
def run_experiment(config_path: str = './config/config.yml'):
    builder = NameSpaceBuilder(config_path, here, path.abspath(path.dirname(config_path)))
    base = builder.base
    config_test(builder, base)
    logging_project.init(base.base_namespace.path_logger_config, base.base_namespace.path_log_folder)
    logger = logging_project.get_logger("__main__")
    logger.info("Start experiment")

    base.base_namespace.evaluation.relevance_threshold = getattr(
        base.base_namespace.evaluation, "relevance_threshold", 0)
    res_handler = ResultHandler(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
    hyper_handler = HyperParameterStudy(rel_threshold=base.base_namespace.evaluation.relevance_threshold)
    dataloader_class = getattr(importlib.import_module("elliot.dataset"),
                               base.base_namespace.data_config.dataloader)
    dataloader = dataloader_class(config=base.base_namespace)
    data_test_list = dataloader.generate_dataobjects()

    for key, model_base in builder.models():
        test_results = []
        test_trials = []
        for data_test in data_test_list:
            logging_project.prepare_logger(key, base.base_namespace.path_log_folder)
            if key.startswith("external."):
                spec = importlib.util.spec_from_file_location(
                    "external", path.relpath(base.base_namespace.external_models_path))
                external = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = external
                spec.loader.exec_module(external)
                model_class = getattr(importlib.import_module("external"), key.split(".", 1)[1])
            else:
                model_class = getattr(importlib.import_module("elliot.recommender"), key)

            model_placeholder = ho.ModelCoordinator(data_test, base.base_namespace, model_base, model_class)
            if isinstance(model_base, tuple):
                logger.info(f"Tuning begun for {model_class.__name__}\n")
                trials = Trials()
                best = fmin(model_placeholder.objective,
                            space=model_base[1],
                            algo=model_base[3],
                            trials=trials,
                            verbose=False,
                            rstate=_rstate,
                            max_evals=model_base[2])

                # Index of the best hyper-parameter combination.
                min_val = np.argmin([i["result"]["loss"] for i in trials._trials])
                best_model_loss = trials._trials[min_val]["result"]["loss"]
                best_model_params = trials._trials[min_val]["result"]["params"]
                best_model_results = trials._trials[min_val]["result"]["test_results"]

                # Append the best trial to the per-split test performance list.
                test_results.append(trials._trials[min_val]["result"])
                test_trials.append(trials)
                logger.info(f"Tuning ended for {model_class.__name__}")
            else:
                logger.info(f"Training begun for {model_class.__name__}\n")
                single = model_placeholder.single()
                best_model_loss = single["loss"]
                best_model_params = single["params"]
                best_model_results = single["test_results"]

                # Append the single run to the per-split test performance list.
                test_results.append(single)
                logger.info(f"Training ended for {model_class.__name__}")

            logger.info(f"Loss:\t{best_model_loss}")
            logger.info(f"Best Model params:\t{best_model_params}")
            logger.info(f"Best Model results:\t{best_model_results}")

        # Best result across the test splits, added to the overall performance handlers.
        min_val = np.argmin([i["loss"] for i in test_results])
        res_handler.add_oneshot_recommender(**test_results[min_val])
        if isinstance(model_base, tuple):
            hyper_handler.add_trials(test_trials[min_val])

    # res_handler.save_results(output=base.base_namespace.path_output_rec_performance)
    hyper_handler.save_trials(output=base.base_namespace.path_output_rec_performance)
    res_handler.save_best_results(output=base.base_namespace.path_output_rec_performance)
    first_metric = base.base_namespace.evaluation.simple_metrics[0] \
        if base.base_namespace.evaluation.simple_metrics else ""
    res_handler.save_best_models(output=base.base_namespace.path_output_rec_performance,
                                 default_metric=first_metric)
    if hasattr(base.base_namespace, "print_results_as_triplets") \
            and base.base_namespace.print_results_as_triplets == True:
        res_handler.save_best_results_as_triplets(output=base.base_namespace.path_output_rec_performance)
        hyper_handler.save_trials_as_triplets(output=base.base_namespace.path_output_rec_performance)
    if hasattr(base.base_namespace.evaluation, "paired_ttest") and base.base_namespace.evaluation.paired_ttest:
        res_handler.save_best_statistical_results(stat_test=StatTest.PairedTTest,
                                                  output=base.base_namespace.path_output_rec_performance)
    if hasattr(base.base_namespace.evaluation, "wilcoxon_test") and base.base_namespace.evaluation.wilcoxon_test:
        res_handler.save_best_statistical_results(stat_test=StatTest.WilcoxonTest,
                                                  output=base.base_namespace.path_output_rec_performance)
    logger.info("End experiment")
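# A minimal, hypothetical invocation of the runner above; the configuration path is
# illustrative and should point at an actual experiment file.
if __name__ == '__main__':
    run_experiment("./config/my_experiment.yml")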