def process_rated(self, user_ratings: pd.DataFrame, index_directory: str):
    """
    Collect the indexed data of the items the user rated positively.

    An item is considered positive when its score is greater than or equal to the
    threshold: ``self.threshold`` when it is set, otherwise the value returned by
    ``self._calc_mean_user_threshold(user_ratings)``. Each positive item is looked up
    in the index; when the lookup returns something, the item's representations and
    its score are kept. The collected data is stored in private attributes of the
    class (it is later used to build the query) and nothing is returned.

    Args:
        user_ratings (pd.DataFrame): DataFrame containing ratings of a single user;
            its 'to_id' and 'score' columns are read
        index_directory (str): path of the index folder
    """
    min_score = self.threshold
    if min_score is None:
        min_score = self._calc_mean_user_threshold(user_ratings)

    recsys_logger.info("Processing rated items")

    # liked_docs maps item id -> the value returned by __get_representations for that
    # item (per the original author's note: a dictionary of field name -> field content,
    # so both the field data and the document it belongs to are preserved).
    # liked_scores holds the matching scores, in the same insertion order.
    liked_docs = {}
    liked_scores = []

    index = SearchIndex(index_directory)
    for item_id, score in zip(user_ratings.to_id, user_ratings.score):
        # Written as a negated '>=' on purpose: a NaN score must be skipped as well
        if not (score >= min_score):
            continue

        # Expected result shape: {item_id: {"item": item_dictionary, "score": item_score}}
        hits = index.query(item_id, 1, classic_similarity=self.__classic_similarity)
        if len(hits) == 0:
            continue

        item = hits.pop(item_id).get('item')
        liked_scores.append(score)
        liked_docs[item_id] = self.__get_representations(item)

    self.__positive_user_docs = liked_docs
    self.__scores = liked_scores
def test_create_contents_in_index(self):
    """
    Fit a ContentAnalyzer that stores the 'Title' field in indexes and check the
    serialized content.

    Three configurations are registered for 'Title' (two SearchIndex ones sharing the
    same index path and one KeywordIndex). After fitting, the serialized content
    'tt0113497.xz' is loaded from the first matching output directory and the first
    two 'Title' representations must be IndexField instances holding a string value.

    The test fails explicitly when no output directory is found, instead of
    passing without having checked anything.
    """
    output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")

    movies_ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=output_dir,
    )

    movies_ca_config.add_multiple_config(
        field_name='Title',
        config_list=[
            FieldConfig(OriginalData(), NLTK(lemmatization=True, stopwords_removal=True),
                        SearchIndex(os.path.join(output_dir, "index")), "test_search"),
            FieldConfig(SkLearnTfIdf(), NLTK(),
                        KeywordIndex(os.path.join(output_dir, "index1")), "test_keyword"),
            FieldConfig(OriginalData(), NLTK(),
                        SearchIndex(os.path.join(output_dir, "index"))),
        ])

    content_analyzer = ContentAnalyzer(movies_ca_config)
    content_analyzer.fit()

    for name in os.listdir(THIS_DIR):
        candidate_dir = os.path.join(THIS_DIR, name)
        if not (os.path.isdir(candidate_dir) and 'movielens_test_original_index' in name):
            continue

        # NOTE: pickle is only acceptable here because the file was just produced
        # by the fit() call above; never unpickle untrusted data.
        with lzma.open(os.path.join(candidate_dir, 'tt0113497.xz'), 'r') as file:
            content = pickle.load(file)

        for position in (0, 1):
            title_representation = content.get_field("Title")[position]
            self.assertIsInstance(title_representation, IndexField)
            self.assertIsInstance(title_representation.value, str)

        # Only the first matching directory is inspected
        break
    else:
        # The loop ended without 'break': no directory matched, so nothing was verified
        self.fail("No 'movielens_test_original_index' output directory found in THIS_DIR")
def rank(self, user_ratings: pd.DataFrame, index_directory: str, recs_number: int = None,
         filter_list: List[str] = None) -> pd.DataFrame:
    """
    Rank the recommended items for the user by querying the index.

    The query previously built by the algorithm is run against the index found in
    index_directory. If the recs_number parameter isn't specified, all items will be
    ranked. One can specify which items must be ranked with the filter_list parameter:
    in this case ONLY items in the filter_list will be used to calculate the rank
    (items already seen by the user may be passed too). Otherwise, ALL unrated items
    will be used to calculate the rank.

    Args:
        user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
        index_directory (str): path of the index folder
        recs_number (int): number of the top items that will be present in the ranking
        filter_list (list): list of the items to rank, if None all unrated items will
            be used to calculate the rank
    Returns:
        pd.DataFrame: DataFrame with a 'to_id' column holding the item ids and a
            'score' column holding the score assigned by the index to each item.
            Rows follow the order in which the index query returns its results;
            no further sorting is applied here.
    """
    recsys_logger.info("Calculating rank")

    mask = self._build_mask_list(user_ratings, filter_list)
    hits = SearchIndex(index_directory).query(
        self.__string_query, recs_number, mask, filter_list, self.__classic_similarity)

    # hits is keyed by item id; every value carries the item's 'score'
    ranked_ids = list(hits)
    ranked_scores = [hits[item_id]['score'] for item_id in ranked_ids]

    return pd.DataFrame({'to_id': ranked_ids, 'score': ranked_scores})
def test_init_writing(self):
    """
    Check how the argument of init_writing affects an already existing index.

    Three SearchIndex objects share the same './init_writing' directory:
    the first one creates the index, the second one reopens it with
    init_writing(False) and the third one reopens it with init_writing(True).
    """

    def write_single_content(index, content_id, text, *init_args):
        # Opens the writer (forwarding init_args untouched), stores one content
        # made of 'content_id' and 'init_writing' fields, then closes the writer
        index.init_writing(*init_args)
        index.new_content()
        index.new_field("content_id", content_id)
        index.new_field("init_writing", text)
        index.serialize_content()
        index.stop_writing()

    creator, appender, replacer = (SearchIndex("./init_writing") for _ in range(3))

    try:
        write_single_content(creator, "0", "test1")

        # init_writing(False) must open the old index instead of replacing it:
        # both the old and the new content are retrievable
        write_single_content(appender, "1", "test2", False)
        self.assertEqual(appender.get_field("init_writing", "0"), "test1")
        self.assertEqual(appender.get_field("init_writing", "1"), "test2")

        # init_writing(True) must replace the old index:
        # only the content written last is retrievable
        write_single_content(replacer, "0", "test3", True)
        self.assertEqual(replacer.get_field("init_writing", "0"), "test3")
        with self.assertRaises(IndexError):
            replacer.get_field("init_writing", "1")
    finally:
        for index in (creator, appender, replacer):
            index.delete()
def test_search_serialize(self):
    """
    Serialize a single content into a SearchIndex and read one of its fields back.

    The value returned by get_field for 'test1' must be exactly the string that
    was written, with its original casing.
    """
    content_fields = (
        ("content_id", "0"),
        ("test1", "This is A test"),
        ("test2", "this is a test for the Text Interface"),
    )

    search_index = SearchIndex("search")
    try:
        search_index.init_writing()
        search_index.new_content()
        for field_name, field_value in content_fields:
            search_index.new_field(field_name, field_value)
        search_index.serialize_content()
        search_index.stop_writing()

        self.assertEqual(search_index.get_field("test1", "0"), "This is A test")
    finally:
        # The index is removed even when an assertion fails
        search_index.delete()
def test_query(self):
    """
    Store three contents in a SearchIndex and query it.

    The query asks for at most 2 results and is given ["2"] and ["0", "1"] as
    its third and fourth arguments (presumably the mask list and the filter
    list) and True as the last one (presumably the classic similarity flag).
    Exactly one result is expected: content "0", returned together with its
    stored 'test1' text.
    """
    # One tuple per content, each holding the (field name, field value) pairs to store
    contents = (
        (("content_id", "0"),
         ("test1", "this is a test for the query on the index"),
         ("test2", "this is the second field")),
        (("content_id", "1"),
         ("test1", "field")),
        (("content_id", "2"),
         ("test1", "query on the index")),
    )

    search_index = SearchIndex("testing_query")
    try:
        search_index.init_writing()
        for content_fields in contents:
            search_index.new_content()
            for field_name, field_value in content_fields:
                search_index.new_field(field_name, field_value)
            search_index.serialize_content()
        search_index.stop_writing()

        # test for querying the index
        hits = search_index.query("test1:(query on the index)", 2, ["2"], ["0", "1"], True)
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits["0"]["item"]["test1"], "this is a test for the query on the index")
    finally:
        # The index is removed even when an assertion fails
        search_index.delete()