コード例 #1
0
    def process_rated(self, user_ratings: pd.DataFrame, index_directory: str):
        """
        Collect, from the index, the data of the items the user rated positively.

        An item is considered positive when its score is greater than or equal to the
        threshold. The threshold is ``self.threshold``; when that is None it is obtained
        from the ratings of the user through ``_calc_mean_user_threshold``.

        Nothing is returned: the representations of the positive items and their scores
        are stored in private attributes of the class, to be used later to fit the
        algorithm (build the query).

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            index_directory (str): path of the index folder
        """
        configured_threshold = self.threshold
        if configured_threshold is None:
            min_score = self._calc_mean_user_threshold(user_ratings)
        else:
            min_score = configured_threshold

        recsys_logger.info("Processing rated items")
        search_index = SearchIndex(index_directory)

        # liked_items maps each positive item_id to whatever __get_representations
        # returns for it; positive_scores collects the corresponding scores in
        # iteration order.
        liked_items = {}
        positive_scores = []

        for rated_id, rating in zip(user_ratings.to_id, user_ratings.score):
            # Deliberately a negated `>=` rather than `<`: a score that does not
            # compare (e.g. NaN) must be skipped too.
            if not rating >= min_score:
                continue

            # Query result shape: {item_id: {"item": item_dictionary, "score": item_score}}
            hits = search_index.query(rated_id, 1, classic_similarity=self.__classic_similarity)
            if len(hits) == 0:
                # The query returned nothing for this item: nothing to extract
                continue

            indexed_item = hits.pop(rated_id).get('item')
            positive_scores.append(rating)
            liked_items[rated_id] = self.__get_representations(indexed_item)

        self.__positive_user_docs = liked_items
        self.__scores = positive_scores
コード例 #2
0
    def test_create_contents_in_index(self):
        """
        Fit a ContentAnalyzer on the 'Title' field using index-backed field
        configurations, then check a serialized content.

        The first directory under THIS_DIR whose name contains
        'movielens_test_original_index' is inspected: the content stored in
        'tt0113497.xz' must expose IndexField instances with string values as its
        first two 'Title' representations.

        NOTE(review): if no directory matches, the loop ends without running any
        assertion.
        """
        output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")
        search_index_path = os.path.join(output_dir, "index")
        keyword_index_path = os.path.join(output_dir, "index1")

        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=output_dir,
        )

        # Three configurations for the same field: two of them point at the same
        # search index location, the last one is given no explicit fourth argument.
        title_configs = [
            FieldConfig(OriginalData(),
                        NLTK(lemmatization=True, stopwords_removal=True),
                        SearchIndex(search_index_path),
                        "test_search"),
            FieldConfig(SkLearnTfIdf(), NLTK(),
                        KeywordIndex(keyword_index_path),
                        "test_keyword"),
            FieldConfig(OriginalData(), NLTK(),
                        SearchIndex(search_index_path)),
        ]
        movies_ca_config.add_multiple_config(field_name='Title',
                                             config_list=title_configs)

        ContentAnalyzer(movies_ca_config).fit()

        for entry in os.listdir(THIS_DIR):
            is_directory = os.path.isdir(os.path.join(THIS_DIR, entry))
            if not is_directory or 'movielens_test_original_index' not in str(entry):
                continue

            serialized_path = os.path.join(THIS_DIR, entry, 'tt0113497.xz')
            with lzma.open(serialized_path, 'r') as file:
                content = pickle.load(file)

                # Same checks for the first and the second 'Title' representation
                for position in (0, 1):
                    self.assertIsInstance(
                        content.get_field("Title")[position], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[position].value, str)

            # Only the first matching directory is checked
            break
コード例 #3
0
    def rank(self,
             user_ratings: pd.DataFrame,
             index_directory: str,
             recs_number: int = None,
             filter_list: List[str] = None) -> pd.DataFrame:
        """
        Rank items for the user by querying the index with the query string stored
        in the instance.

        A mask list is built from the user ratings and the filter_list through
        ``_build_mask_list``; the mask list, the filter_list and recs_number are then
        forwarded to the index query together with the stored query string.

        If recs_number isn't specified, all items will be ranked. If filter_list is
        specified, ONLY items in the filter_list will be used to calculate the rank,
        otherwise ALL unrated items will be used.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            index_directory (str): path of the index folder
            recs_number (int): number of the top items that will be present in the ranking
            filter_list (list): list of the items to rank, if None all unrated items will be
                used to calculate the rank
        Returns:
            pd.DataFrame: DataFrame with a 'to_id' column holding the item ids and a 'score'
                column holding the score given by the index query. Rows follow the order
                of the query result; no sorting is done here (presumably the index already
                returns them by descending score - TODO confirm)
        """
        recsys_logger.info("Calculating rank")

        items_to_mask = self._build_mask_list(user_ratings, filter_list)

        search_index = SearchIndex(index_directory)
        hits = search_index.query(self.__string_query, recs_number, items_to_mask,
                                  filter_list, self.__classic_similarity)

        # hits is keyed by item id; each value carries the item 'score'
        ranked_ids = list(hits)
        ranked_scores = [hits[item_id]['score'] for item_id in ranked_ids]

        return pd.DataFrame({'to_id': ranked_ids, 'score': ranked_scores})
コード例 #4
0
    def test_init_writing(self):
        """
        Check how the argument of init_writing affects an index that already exists.

        Three SearchIndex objects share the "./init_writing" location: the first one
        creates the index, the second one opens it with False and the third one opens
        it with True.
        """
        def write_single_content(index, content_id, text, *init_args):
            # Runs a full writing session that stores exactly one content
            index.init_writing(*init_args)
            index.new_content()
            index.new_field("content_id", content_id)
            index.new_field("init_writing", text)
            index.serialize_content()
            index.stop_writing()

        creator = SearchIndex("./init_writing")
        appender = SearchIndex("./init_writing")
        replacer = SearchIndex("./init_writing")

        try:
            write_single_content(creator, "0", "test1")

            # With False the old index must be opened, not replaced: the content
            # written before is still there next to the new one
            write_single_content(appender, "1", "test2", False)
            self.assertEqual(appender.get_field("init_writing", "0"), "test1")
            self.assertEqual(appender.get_field("init_writing", "1"), "test2")

            # With True the old index must be replaced: only the content written
            # in this session can be retrieved
            write_single_content(replacer, "0", "test3", True)
            self.assertEqual(replacer.get_field("init_writing", "0"), "test3")
            with self.assertRaises(IndexError):
                replacer.get_field("init_writing", "1")
        finally:
            for index in (creator, appender, replacer):
                index.delete()
コード例 #5
0
 def test_search_serialize(self):
     """
     Serialize one content with three fields in a SearchIndex and check that the
     value of the 'test1' field can be read back unchanged.
     """
     content_fields = (
         ("content_id", "0"),
         ("test1", "This is A test"),
         ("test2", "this is a test for the Text Interface"),
     )

     search_index = SearchIndex("search")
     try:
         search_index.init_writing()
         search_index.new_content()
         for field_name, field_value in content_fields:
             search_index.new_field(field_name, field_value)
         search_index.serialize_content()
         search_index.stop_writing()

         stored_value = search_index.get_field("test1", "0")
         self.assertEqual(stored_value, "This is A test")
     finally:
         # The index is removed even when an assertion fails
         search_index.delete()
コード例 #6
0
    def test_query(self):
        """
        Write three contents in a SearchIndex, query it and check that exactly one
        result comes back, keyed by content id "0" and carrying its 'test1' field.
        """
        # One entry per content, each one listing its (field name, field value) pairs
        contents = [
            [("content_id", "0"),
             ("test1", "this is a test for the query on the index"),
             ("test2", "this is the second field")],
            [("content_id", "1"),
             ("test1", "field")],
            [("content_id", "2"),
             ("test1", "query on the index")],
        ]

        search_index = SearchIndex("testing_query")
        try:
            search_index.init_writing()
            for content_fields in contents:
                search_index.new_content()
                for field_name, field_value in content_fields:
                    search_index.new_field(field_name, field_value)
                search_index.serialize_content()
            search_index.stop_writing()

            # NOTE(review): the positional arguments are presumably query string,
            # number of results, mask list, filter list, classic similarity flag -
            # confirm against SearchIndex.query
            hits = search_index.query("test1:(query on the index)", 2, ["2"], ["0", "1"], True)
            self.assertEqual(len(hits), 1)
            self.assertEqual(hits["0"]["item"]["test1"], "this is a test for the query on the index")
        finally:
            # The index is removed even when an assertion fails
            search_index.delete()