def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet.

    :param config: configuration object holding the data paths, splitting strategy,
                   and visual side-information settings
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path

        visual_feature_path = getattr(config.data_config.side_information,
                                      "visual_features", None)
        item_mapping_path = getattr(config.data_config.side_information,
                                    "item_mapping", None)
        size_tuple = getattr(config.data_config.side_information,
                             "output_image_size", None)

        if visual_feature_path and item_mapping_path:
            feature_set = set(
                pd.read_csv(item_mapping_path, sep="\t", header=None)[0].unique().tolist())
        else:
            feature_set = set()

        images_src_folder = getattr(config.data_config.side_information,
                                    "images_src_folder", None)
        if images_src_folder:
            # Item ids are recovered from the image file names (e.g. "42.jpg" -> 42).
            image_set = {int(path.split(".")[0]) for path in os.listdir(images_src_folder)}
        else:
            image_set = set()

        # An item is "visual" only if it appears in both sources (set intersection).
        if feature_set and image_set:
            visual_set = feature_set & image_set
        elif feature_set:
            visual_set = feature_set
        elif image_set:
            visual_set = image_set
        else:
            visual_set = set()

        self.side_information_data = SimpleNamespace()

        self.train_dataframe, self.side_information_data.aligned_items = \
            self.load_dataset_dataframe(path_train_data, "\t", visual_set)
        self.side_information_data.visual_feature_path = visual_feature_path
        self.side_information_data.item_mapping_path = item_mapping_path
        self.side_information_data.images_src_folder = images_src_folder
        self.side_information_data.size_tuple = size_tuple

        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t",
                                          header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t",
                                                    header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)

            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)],
                                self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

    elif config.data_config.strategy == "hierarchy":
        self.tuple_list = self.read_splitting(config.data_config.root_folder)

    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split according to the configuration")
        path_dataset = config.data_config.dataset_path

        visual_feature_path = getattr(config.data_config.side_information,
                                      "visual_features", None)
        item_mapping_path = getattr(config.data_config.side_information,
                                    "item_mapping", None)
        size_tuple = getattr(config.data_config.side_information,
                             "output_image_size", None)

        if visual_feature_path and item_mapping_path:
            feature_set = set(
                pd.read_csv(item_mapping_path, sep="\t", header=None)[0].unique().tolist())
        else:
            feature_set = set()

        images_src_folder = getattr(config.data_config.side_information,
                                    "images_src_folder", None)
        if images_src_folder:
            image_set = {int(path.split(".")[0]) for path in os.listdir(images_src_folder)}
        else:
            image_set = set()

        # Same intersection logic as in the "fixed" branch above.
        if feature_set and image_set:
            visual_set = feature_set & image_set
        elif feature_set:
            visual_set = feature_set
        elif image_set:
            visual_set = image_set
        else:
            visual_set = set()

        self.side_information_data = SimpleNamespace()

        self.dataframe, self.side_information_data.aligned_items = \
            self.load_dataset_dataframe(path_dataset, "\t", visual_set)
        self.side_information_data.visual_feature_path = visual_feature_path
        self.side_information_data.item_mapping_path = item_mapping_path
        self.side_information_data.images_src_folder = images_src_folder
        self.side_information_data.size_tuple = size_tuple

        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")

        self.dataframe = PreFilter.filter(self.dataframe, self.config)

        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()

    else:
        raise ValueError("Strategy option not recognized")
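# --- Usage sketch (standalone, not part of the class above) -----------------
# The constructor only reads attributes off the ``config`` object. Below is a
# minimal, hypothetical configuration for the "fixed" strategy built with
# SimpleNamespace; the DataSet class name (taken from the docstring) and every
# path are assumptions for illustration, not values shipped with the project.
from types import SimpleNamespace

side_information = SimpleNamespace(
    visual_features="data/visual_features.npy",   # assumed path
    item_mapping="data/item_mapping.tsv",         # assumed path
    images_src_folder="data/images/",             # assumed path
    output_image_size=(224, 224),                 # assumed size
)
data_config = SimpleNamespace(
    strategy="fixed",
    train_path="data/train.tsv",
    validation_path="data/validation.tsv",
    test_path="data/test.tsv",
    side_information=side_information,
)
config = SimpleNamespace(config_test=False, data_config=data_config)

# dataset = DataSet(config)  # assumed class name; only valid inside the project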
def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet.

    :param config: configuration object holding the data paths and splitting strategy
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path

        self.train_dataframe = pd.read_csv(path_train_data, sep="\t",
                                           header=None, names=self.column_names)
        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t",
                                          header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t",
                                                    header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)

            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)],
                                self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

    elif config.data_config.strategy == "hierarchy":
        self.tuple_list = self.read_splitting(config.data_config.root_folder)

    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split according to the configuration")
        path_dataset = config.data_config.dataset_path

        self.dataframe = pd.read_csv(path_dataset, sep="\t",
                                     header=None, names=self.column_names)
        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")

        self.dataframe = PreFilter.filter(self.dataframe, self.config)

        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()

    else:
        raise ValueError("Strategy option not recognized")
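# --- Input format sketch (standalone) ----------------------------------------
# Every variant reads the rating files as headerless, tab-separated values in
# the fixed column order userId, itemId, rating, timestamp. The snippet below
# mimics that parsing on an in-memory string; the interaction values are
# invented purely for illustration.
import io
import pandas as pd

column_names = ['userId', 'itemId', 'rating', 'timestamp']
raw = "1\t42\t5.0\t964982703\n1\t31\t3.5\t964981247\n2\t42\t4.0\t964982931\n"

dataframe = pd.read_csv(io.StringIO(raw), sep="\t", header=None, names=column_names)
print(dataframe)
#    userId  itemId  rating  timestamp
# 0       1      42     5.0  964982703
# 1       1      31     3.5  964981247
# 2       2      42     4.0  964982931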
def __init__(self, config, *args, **kwargs):
    """
    Constructor of DataSet.

    :param config: configuration object holding the data paths and the
                   knowledge-graph side-information settings
    """
    self.logger = logging.get_logger(self.__class__.__name__)
    self.args = args
    self.kwargs = kwargs
    self.config = config
    self.column_names = ['userId', 'itemId', 'rating', 'timestamp']

    if config.config_test:
        return

    self.side_information_data = SimpleNamespace()

    if config.data_config.strategy == "fixed":
        path_train_data = config.data_config.train_path
        path_val_data = getattr(config.data_config, "validation_path", None)
        path_test_data = config.data_config.test_path

        work_directory_path = config.data_config.side_information.work_directory
        map_path = config.data_config.side_information.map
        features_path = config.data_config.side_information.features
        predicates_path = config.data_config.side_information.predicates

        self.train_dataframe, self.side_information_data.feature_map, \
            self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_train_data, predicates_path, features_path)
        self.train_dataframe = self.check_timestamp(self.train_dataframe)
        self.logger.info(f"{path_train_data} - Loaded")

        self.test_dataframe = pd.read_csv(path_test_data, sep="\t",
                                          header=None, names=self.column_names)
        self.test_dataframe = self.check_timestamp(self.test_dataframe)

        if config.binarize:
            # Implicit-feedback setting: collapse every rating to 1.
            self.train_dataframe["rating"] = 1
            self.test_dataframe["rating"] = 1

        if path_val_data:
            self.validation_dataframe = pd.read_csv(path_val_data, sep="\t",
                                                    header=None, names=self.column_names)
            self.validation_dataframe = self.check_timestamp(self.validation_dataframe)

            self.tuple_list = [([(self.train_dataframe, self.validation_dataframe)],
                                self.test_dataframe)]
        else:
            self.tuple_list = [(self.train_dataframe, self.test_dataframe)]

    elif config.data_config.strategy == "hierarchy":
        item_mapping_path = getattr(config.data_config.side_information,
                                    "item_mapping", None)
        self.side_information_data.feature_map = self.load_attribute_file(item_mapping_path)
        self.tuple_list = self.read_splitting(config.data_config.root_folder)
        self.logger.info(f"{config.data_config.root_folder} - Loaded")

    elif config.data_config.strategy == "dataset":
        self.logger.info("The dataset will be split according to the configuration")
        path_dataset = config.data_config.dataset_path

        work_directory_path = config.data_config.side_information.work_directory
        map_path = config.data_config.side_information.map
        features_path = config.data_config.side_information.features
        predicates_path = config.data_config.side_information.predicates

        self.dataframe, self.side_information_data.feature_map, \
            self.side_information_data.predicate_mapping = self.load_dataset_dataframe(
                path_dataset, predicates_path, features_path)
        self.dataframe = self.check_timestamp(self.dataframe)
        self.logger.info(f"{path_dataset} - Loaded")

        self.dataframe = PreFilter.filter(self.dataframe, self.config)

        if config.binarize:
            self.dataframe["rating"] = 1

        splitter = Splitter(self.dataframe, self.config.splitting)
        self.tuple_list = splitter.process_splitting()

    else:
        raise ValueError("Strategy option not recognized")
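# --- tuple_list shape sketch (standalone) -------------------------------------
# With a validation split every variant stores the data as
# [([(train, validation)], test)], otherwise as [(train, test)]. The snippet
# below unpacks the nested form with empty placeholder frames to show how
# downstream code can iterate over it; the frames themselves carry no data.
import pandas as pd

columns = ['userId', 'itemId', 'rating', 'timestamp']
train = pd.DataFrame(columns=columns)
validation = pd.DataFrame(columns=columns)
test = pd.DataFrame(columns=columns)

tuple_list = [([(train, validation)], test)]   # shape used when validation_path is set

for inner_folds, test_df in tuple_list:
    for train_df, val_df in inner_folds:
        print(len(train_df), len(val_df), len(test_df))   # 0 0 0 with the placeholders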