def setUp(self): """ Setup method that is called at the beginning of each test. """ self.documents, self.users = 18, 10 documents_cnt, users_cnt = self.documents, self.users self.n_iterations = 15 self.k_folds = 3 self.hyperparameters = {'n_factors': 5, '_lambda': 0.01} self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds} self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations) self.n_recommendations = 1 def mock_get_ratings_matrix(self=None): return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)] for user in range(users_cnt)] self.ratings_matrix = numpy.array(mock_get_ratings_matrix()) setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix) self.evaluator = Evaluator(self.ratings_matrix) self.cf = CollaborativeFiltering(self.initializer, self.evaluator, self.hyperparameters, self.options, load_matrices=True) self.cf.train() self.cf.evaluator.k_folds = self.k_folds self.test_data = self.cf.test_data self.predictions = self.cf.get_predictions() self.rounded_predictions = self.cf.rounded_predictions()
def setUp(self): """ Setup method that is called at the beginning of each test. """ self.documents, self.users = 8, 10 documents_cnt, users_cnt = self.documents, self.users self.n_iterations = 5 self.n_factors = 5 self.k_folds = 5 self.hyperparameters = {'n_factors': self.n_factors} self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds} self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations) def mock_process(self=None): pass def mock_get_abstracts(self=None): return {0: 'hell world berlin dna evolution', 1: 'freiburg is green', 2: 'the best dna is the dna of dinasours', 3: 'truth is absolute', 4: 'berlin is not that green', 5: 'truth manifests itself', 6: 'plato said truth is beautiful', 7: 'freiburg has dna'} def mock_get_ratings_matrix(self=None): return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)] for user in range(users_cnt)] def mock_get_word_distribution(self=None): abstracts = mock_get_abstracts() vocab = set(itertools.chain(*list(map(lambda ab: ab.split(' '), abstracts.values())))) w2i = dict(zip(vocab, range(len(vocab)))) word_to_count = [(w2i[word], sum(abstract.split(' ').count(word) for doc_id, abstract in abstracts.items())) for word in vocab] article_to_word = list(set([(doc_id, w2i[word]) for doc_id, abstract in abstracts.items() for word in abstract.split(' ')])) article_to_word_to_count = list(set([(doc_id, w2i[word], abstract.count(word)) for doc_id, abstract in abstracts.items() for word in abstract.split(' ')])) return word_to_count, article_to_word, article_to_word_to_count abstracts = mock_get_abstracts() word_to_count, article_to_word, article_to_word_to_count = mock_get_word_distribution() self.abstracts_preprocessor = AbstractsPreprocessor(abstracts, word_to_count, article_to_word, article_to_word_to_count) self.ratings_matrix = numpy.array(mock_get_ratings_matrix()) self.evaluator = Evaluator(self.ratings_matrix, self.abstracts_preprocessor) setattr(DataParser, "get_abstracts", mock_get_abstracts) setattr(DataParser, "process", mock_process) setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix) setattr(DataParser, "get_word_distribution", mock_get_word_distribution)
def setUp(self): """ Setup method that is called at the beginning of each test. """ self.documents, self.users = 30, 4 documents_cnt, users_cnt = self.documents, self.users self.n_factors = 5 self.n_iterations = 20 self.k_folds = 3 self.hyperparameters = {'n_factors': self.n_factors, '_lambda': 0.01} self.options = {'k_folds': self.k_folds, 'n_iterations': self.n_iterations} self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations) def mock_get_ratings_matrix(self=None): return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)] for user in range(users_cnt)] self.ratings_matrix = numpy.array(mock_get_ratings_matrix()) self.evaluator = Evaluator(self.ratings_matrix) setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)
def runTest(self):
    users_cnt, documents_cnt = self.users, self.documents
    config = RecommenderConfiguration().get_hyperparameters()
    config['n_factors'] = 5
    initializer = ModelInitializer(config, 1)
    path = initializer._create_path('user_v', (users_cnt, documents_cnt))
    self.assertTrue(path.endswith('n_iterations-1,n_rows-10user_v.dat'))
    matrix_shape = (users_cnt, config['n_factors'])
    users_mat = numpy.random.random(matrix_shape)
    initializer.save_matrix(users_mat, 'user_v')
    self.assertTrue(os.path.isfile(path))
    loaded, loaded_matrix = initializer.load_matrix(config, 'user_v', matrix_shape)
    self.assertTrue(loaded)
    self.assertTrue(numpy.alltrue(loaded_matrix == users_mat))
def setUp(self): """ Setup method that is called at the beginning of each test. """ self.documents, self.users = 8, 10 documents_cnt, users_cnt = self.documents, self.users self.hyperparameters = { '_lambda': [0.0001, 0.1], 'n_factors': [10, 20] } self.n_iterations = 15 self.options = {'n_iterations': self.n_iterations, 'k_folds': 1} self.initial_config = {'_lambda': 0, 'n_factors': 10} self.initializer = ModelInitializer(self.initial_config.copy(), self.n_iterations) def mock_get_ratings_matrix(self=None): return [[ int(not bool((article + user) % 3)) for article in range(documents_cnt) ] for user in range(users_cnt)] self.ratings_matrix = numpy.array(mock_get_ratings_matrix()) setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)
def __init__(self, initializer=None, abstracts_preprocessor=None, ratings=None, config=None,
             process_parser=False, verbose=False, load_matrices=True, dump_matrices=True,
             train_more=True, random_seed=False, results_file_name='top_recommendations'):
    """
    Constructor of the RecommenderSystem.

    :param ModelInitializer initializer: A model initializer.
    :param AbstractsPreprocessor abstracts_preprocessor: A preprocessor of abstracts; if None, abstracts are queried.
    :param int[][] ratings: Ratings matrix; if None, the matrix is queried from the database.
    :param config: A custom configuration forwarded to RecommenderConfiguration.
    :param boolean process_parser: A flag deciding whether to run the data parser.
    :param boolean verbose: A flag deciding whether to print progress.
    :param boolean load_matrices: A flag deciding whether to load previously dumped matrices.
    :param boolean dump_matrices: A flag for saving output matrices.
    :param boolean train_more: A flag to train the collaborative filtering further after loading matrices.
    :param boolean random_seed: A flag to determine if we will use a random seed or not.
    :param str results_file_name: Top recommendations results' file name.
    """
    if process_parser:
        DataParser.process()

    if ratings is None:
        self.ratings = numpy.array(DataParser.get_ratings_matrix())
    else:
        self.ratings = ratings

    if abstracts_preprocessor is None:
        self.abstracts_preprocessor = AbstractsPreprocessor(DataParser.get_abstracts(),
                                                            *DataParser.get_word_distribution())
    else:
        self.abstracts_preprocessor = abstracts_preprocessor

    # Get configurations
    self.config = RecommenderConfiguration(config)

    # Set flags
    self.results_file_name = results_file_name + '.dat'
    self._verbose = verbose
    self._dump_matrices = dump_matrices
    self._load_matrices = load_matrices
    self._train_more = train_more
    self._split_type = 'user'
    self._random_seed = random_seed

    self.set_hyperparameters(self.config.get_hyperparameters())
    self.set_options(self.config.get_options())
    self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iter, self._verbose)

    if self.config.get_error_metric() == 'RMS':
        self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor,
                                   self._random_seed, self._verbose)
    else:
        raise NameError("Not a valid error metric %s. Only option is 'RMS'"
                        % self.config.get_error_metric())

    # Initialize content based.
    if self.config.get_content_based() == 'None':
        self.content_based = ContentBased(self.initializer, self.evaluator, self.hyperparameters,
                                          self.options, self._verbose, self._load_matrices,
                                          self._dump_matrices)
    elif self.config.get_content_based() == 'LDA':
        self.content_based = LDARecommender(self.initializer, self.evaluator, self.hyperparameters,
                                            self.options, self._verbose, self._load_matrices,
                                            self._dump_matrices)
    elif self.config.get_content_based() == 'LDA2Vec':
        self.content_based = LDA2VecRecommender(self.initializer, self.evaluator,
                                                self.hyperparameters, self.options, self._verbose,
                                                self._load_matrices, self._dump_matrices)
    else:
        raise NameError("Not a valid content based %s. Options are 'None', "
                        "'LDA', 'LDA2Vec'" % self.config.get_content_based())

    # Initialize collaborative filtering.
    if self.config.get_collaborative_filtering() == 'ALS':
        is_hybrid = self.config.get_recommender() == 'hybrid'
        if is_hybrid and self.config.get_content_based() == 'None':
            raise NameError("Not valid content based 'None' with hybrid recommender")
        self.collaborative_filtering = CollaborativeFiltering(self.initializer, self.evaluator,
                                                              self.hyperparameters, self.options,
                                                              self._verbose, self._load_matrices,
                                                              self._dump_matrices, self._train_more,
                                                              is_hybrid)
    elif self.config.get_collaborative_filtering() == 'SDAE':
        self.collaborative_filtering = SDAERecommender(self.initializer, self.evaluator,
                                                       self.hyperparameters, self.options,
                                                       self._verbose, self._load_matrices,
                                                       self._dump_matrices)
        if not self.config.get_content_based() == 'None':
            raise NameError("Not a valid content based %s with SDAE. You can only use 'None'"
                            % self.config.get_content_based())
    elif self.config.get_collaborative_filtering() == 'None':
        if not self.config.get_recommender() == 'itembased':
            raise NameError("None collaborative filtering is only valid with itembased recommender type")
        elif self.config.get_content_based() == 'None':
            raise NameError("Not valid content based 'None' with item-based recommender")
        self.collaborative_filtering = None
    else:
        raise NameError("Not a valid collaborative filtering %s. "
                        "Only options are 'None', 'ALS', 'SDAE'"
                        % self.config.get_collaborative_filtering())

    # Initialize recommender
    if self.config.get_recommender() == 'itembased':
        self.recommender = self.content_based
    elif self.config.get_recommender() == 'userbased':
        self.recommender = self.collaborative_filtering
    elif self.config.get_recommender() == 'hybrid':
        self.recommender = self
    else:
        raise NameError("Invalid recommender type %s. "
                        "Only options are 'userbased', 'itembased', and 'hybrid'"
                        % self.config.get_recommender())
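# The constructor above reads its settings through RecommenderConfiguration getters. The dict
# below only sketches the kind of choices those getters expose; the key names mirror the getter
# names and are assumptions, not the verified configuration schema of the project.
example_config = {
    'recommender': 'hybrid',                                # 'userbased', 'itembased', or 'hybrid'
    'content_based': 'LDA',                                 # 'None', 'LDA', or 'LDA2Vec'
    'collaborative_filtering': 'ALS',                       # 'None', 'ALS', or 'SDAE'
    'error_metric': 'RMS',                                  # only 'RMS' is accepted
    'hyperparameters': {'n_factors': 5, '_lambda': 0.01},   # forwarded to set_hyperparameters
    'options': {'n_iterations': 15, 'k_folds': 3},          # forwarded to set_options
}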
def __init__(self, use_database=True, verbose=True, load_matrices=True, dump=True,
             train_more=True, random_seed=False, config=None):
    """
    Setup the data and configuration for the recommenders.
    """
    if use_database:
        self.ratings = numpy.array(DataParser.get_ratings_matrix())
        # Ratings rows are users and columns are documents, matching the mock matrix
        # built in the else-branch below.
        self.users, self.documents = self.ratings.shape
        self.abstracts_preprocessor = AbstractsPreprocessor(DataParser.get_abstracts(),
                                                            *DataParser.get_word_distribution())
    else:
        abstracts = {0: 'hell world berlin dna evolution',
                     1: 'freiburg is green',
                     2: 'the best dna is the dna of dinasours',
                     3: 'truth is absolute',
                     4: 'berlin is not that green',
                     5: 'truth manifests itself',
                     6: 'plato said truth is beautiful',
                     7: 'freiburg has dna'}
        vocab = set(itertools.chain(*list(map(lambda ab: ab.split(' '), abstracts.values()))))
        w2i = dict(zip(vocab, range(len(vocab))))
        word_to_count = [(w2i[word], sum(abstract.split(' ').count(word)
                                         for doc_id, abstract in abstracts.items()))
                         for word in vocab]
        article_to_word = list(set([(doc_id, w2i[word])
                                    for doc_id, abstract in abstracts.items()
                                    for word in abstract.split(' ')]))
        article_to_word_to_count = list(set([(doc_id, w2i[word], abstract.count(word))
                                             for doc_id, abstract in abstracts.items()
                                             for word in abstract.split(' ')]))
        self.abstracts_preprocessor = AbstractsPreprocessor(abstracts, word_to_count,
                                                            article_to_word, article_to_word_to_count)
        self.documents, self.users = 8, 10
        self.ratings = numpy.array([[int(not bool((article + user) % 3))
                                     for article in range(self.documents)]
                                    for user in range(self.users)])

    self.verbose = verbose
    self.load_matrices = load_matrices
    self.dump = dump
    self.train_more = train_more
    self.random_seed = random_seed
    self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor,
                               self.random_seed, self.verbose)
    self.config = RecommenderConfiguration()
    self.hyperparameters = self.config.get_hyperparameters()
    self.options = self.config.get_options()
    self.initializer = ModelInitializer(self.hyperparameters.copy(),
                                        self.options['n_iterations'], self.verbose)
def __init__(self, initializer=None, abstracts_preprocessor=None, ratings=None, config=None,
             process_parser=False, verbose=False, load_matrices=True, dump_matrices=True,
             train_more=True):
    """
    Constructor of the RecommenderSystem.

    :param ModelInitializer initializer: A model initializer.
    :param AbstractsPreprocessor abstracts_preprocessor: A preprocessor of abstracts; if None, abstracts are queried.
    :param int[][] ratings: Ratings matrix; if None, the matrix is queried from the database.
    :param config: A custom configuration forwarded to RecommenderConfiguration.
    :param boolean process_parser: A flag deciding whether to run the data parser.
    :param boolean verbose: A flag deciding whether to print progress.
    :param boolean load_matrices: A flag deciding whether to load previously dumped matrices.
    :param boolean dump_matrices: A flag for saving output matrices.
    :param boolean train_more: A flag to train the collaborative filtering further after loading matrices.
    """
    if process_parser:
        DataParser.process()

    if ratings is None:
        self.ratings = numpy.array(DataParser.get_ratings_matrix())
    else:
        self.ratings = ratings

    if abstracts_preprocessor is None:
        self.abstracts_preprocessor = AbstractsPreprocessor(DataParser.get_abstracts(),
                                                            *DataParser.get_word_distribution())
    else:
        self.abstracts_preprocessor = abstracts_preprocessor

    # Get configurations
    self.config = RecommenderConfiguration(config)
    self.set_hyperparameters(self.config.get_hyperparameters())
    self.set_options(self.config.get_options())

    # Set flags
    self._verbose = verbose
    self._dump_matrices = dump_matrices
    self._load_matrices = load_matrices
    self._train_more = train_more

    self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iter, self._verbose)

    if self.config.get_error_metric() == 'RMS':
        self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor)
    else:
        raise NameError("Not a valid error metric %s. Only option is 'RMS'"
                        % self.config.get_error_metric())

    # Initialize content based.
    if self.config.get_content_based() == 'None':
        self.content_based = ContentBased(self.initializer, self.evaluator, self.hyperparameters,
                                          self.options, self._verbose, self._load_matrices,
                                          self._dump_matrices)
    elif self.config.get_content_based() == 'LDA':
        self.content_based = LDARecommender(self.initializer, self.evaluator, self.hyperparameters,
                                            self.options, self._verbose, self._load_matrices,
                                            self._dump_matrices)
    elif self.config.get_content_based() == 'LDA2Vec':
        self.content_based = LDA2VecRecommender(self.initializer, self.evaluator,
                                                self.hyperparameters, self.options, self._verbose,
                                                self._load_matrices, self._dump_matrices)
    else:
        raise NameError("Not a valid content based %s. Options are 'None', "
                        "'LDA', 'LDA2Vec'" % self.config.get_content_based())

    # Initialize collaborative filtering.
    if self.config.get_collaborative_filtering() == 'ALS':
        self.collaborative_filtering = CollaborativeFiltering(self.initializer, self.evaluator,
                                                              self.hyperparameters, self.options,
                                                              self._verbose, self._load_matrices,
                                                              self._dump_matrices, self._train_more)
    else:
        raise NameError("Not a valid collaborative filtering %s. "
                        "Only option is 'ALS'" % self.config.get_collaborative_filtering())

    # Initialize recommender
    if self.config.get_recommender() == 'itembased':
        self.recommender = self.content_based
    elif self.config.get_recommender() == 'userbased':
        self.recommender = self.collaborative_filtering
    else:
        raise NameError("Invalid recommender type %s. "
                        "Only options are 'userbased' and 'itembased'"
                        % self.config.get_recommender())
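# Minimal usage sketch for this constructor (kept in comments because the import path and the
# presence of a default configuration file are assumptions about the project layout; passing
# abstracts_preprocessor and ratings explicitly avoids any database access):
#
#     system = RecommenderSystem(abstracts_preprocessor=abstracts_preprocessor,
#                                ratings=ratings_matrix,
#                                verbose=True, load_matrices=False, dump_matrices=False)
#     recommender = system.recommender  # content-based or collaborative model chosen by the config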