Exemple #1
0
    def test_load_content_instance(self):
        """
        Checks that an item loaded from contents_path exposes the requested content id and
        that loading from the "aaa" directory yields None.
        A FileNotFoundError raised at any point is reported as a test failure
        """
        try:
            loaded = load_content_instance(contents_path, "tt0112281")
            self.assertEqual(loaded.content_id, "tt0112281")

            missing = load_content_instance("aaa", "1")
            self.assertEqual(missing, None)
        except FileNotFoundError:
            # Turn the exception into an explicit failure instead of a test error
            self.fail("File not found!")
Exemple #2
0
    def preprocessing(self, items_directory: str, ratings: pd.DataFrame, candidate_item_id_list: list = None):
        """
        Function used to retrieve data that will be used in the computation of the ranking.
        It loads the rated and unrated items, computes the threshold if it was set to -1 and
        extracts the features from the unrated items.

        Note that when the threshold is -1 it is replaced, on the instance itself, by the mean
        of the "score" column of the ratings.

        Args:
            items_directory (str): directory where the items are stored
            ratings (Dataframe): dataframe which contains ratings given by the user
            candidate_item_id_list (list): list of the items that can be recommended, if None
                or empty all unrated items will be used

        Returns:
            rated_items (list): list containing the instances of the rated items
            unrated_items (list): list containing the instances of the unrated items
            unrated_features_baglist (list): list containing the features extracted from the unrated items

        Raises:
            ValueError: if the threshold is not in the range [-1, 1], or if there are no valid
                unrated items to consider (example: ['test', 'test2'] passed as candidates but
                neither test nor test2 can be loaded from the items directory)
        """

        # The threshold must lie in [-1, 1]; the minimum value (-1) is a sentinel meaning
        # "use the mean of all the ratings as threshold"
        if not -1 <= self.__threshold <= 1:
            raise ValueError("Threshold value must be in the range [-1, 1]")

        if self.__threshold == -1:
            self.__threshold = pd.to_numeric(ratings["score"], downcast="float").mean()

        # Load unrated items from the path
        if candidate_item_id_list is None or len(candidate_item_id_list) == 0:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            # Each candidate is loaded exactly once; candidates that could not be loaded
            # (None) are left out of the list
            loaded_candidates = (load_content_instance(items_directory, item_id)
                                 for item_id in candidate_item_id_list)
            unrated_items = [item for item in loaded_candidates if item is not None]

        if len(unrated_items) == 0:
            raise ValueError("No valid unrated items found")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, ratings)

        return rated_items, unrated_items, self.__calc_unrated_baglist(unrated_items)
Exemple #3
0
    def fit_eval_predict(self, user_id, user_ratings: pd.DataFrame,
                         test_set: pd.DataFrame):
        """
        Loads the items referenced by the test set and asks the score prediction algorithm
        of the config to predict on them, using the user ratings as train set.

        Args:
            user_id: user for which predictions will be computed
            user_ratings: train set
            test_set: frame whose 'to_id' column lists the ids of the items to predict
        Returns:
            score_frame (DataFrame): result frame whose columns are: to_id, rating
        """
        items_directory = self.__config.items_directory

        logger.info("Loading items")
        items = []
        for raw_item_id in test_set.to_id:
            # Characters that are neither word characters nor whitespace are stripped
            # from the id before it is used to load the item
            cleaned_item_id = re.sub(r'[^\w\s]', '', raw_item_id)
            items.append(load_content_instance(items_directory, cleaned_item_id))

        logger.info("Loaded %d items" % len(items))

        # calculate predictions
        logger.info("Computing predictions")
        return self.__config.score_prediction_algorithm.predict(
            user_id, items, user_ratings, items_directory)
    def predict(self,
                user_id: str,
                ratings: pd.DataFrame,
                recs_number,
                items_directory: str,
                candidate_item_id_list: List = None):
        """
        Finds the documents that the user liked (score strictly greater than the positive
        threshold) and then calls __recs_query to execute the prediction.
        Rated items that can not be loaded from the items directory are ignored
        Args:
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used
            user_id: user for which recommendations will be computed (not read by this method)
            recs_number (int): How long the ranking will be
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            items_directory (str): Name of the directory where the items are stored.
        Returns:
            (pd.DataFrame) the result of __recs_query
        """
        index_path = os.path.join(items_directory, 'search_index')
        if not DEVELOPING:
            index_path = os.path.join(home_path, items_directory,
                                      'search_index')

        scores = []
        rated_document_list = []
        for item_id, score in zip(ratings.to_id, ratings.score):
            # Only positively rated items are needed, so the item is loaded
            # only once its score has passed the threshold check
            if score > self.__positive_threshold:
                item = load_content_instance(items_directory, item_id)
                # load_content_instance yields None for items it can not find
                if item is not None:
                    rated_document_list.append(item.index_document_id)
                    scores.append(score)

        return self.__recs_query(rated_document_list, scores, recs_number,
                                 index_path, candidate_item_id_list)
Exemple #5
0
    def _add_usr_properties(self, row):
        """
        Private method that, given a row containing the 'from_id' field, tries to load the
        corresponding content from the user contents directory. If the content is found,
        properties are extracted from it according to the user exogenous representation and
        the user exogenous properties configured on this object:

        representation set, properties set:
        ---------> the given properties are extracted from the given representation
            EXAMPLE:
                user_exo_representation = 0
                user_exo_properties = ['gender', 'birthdate']

                will extract the 'gender' and 'birthdate' property from the representation '0'

        representation set, properties NOT set:
        ---------> ALL properties of the given representation are extracted
            EXAMPLE:
                user_exo_representation = 0

                will extract ALL properties from the representation '0'

        representation NOT set, properties set:
        ---------> the given properties are extracted from ALL representations
            EXAMPLE:
                user_exo_properties = ['gender', 'birthdate']

                will extract the 'gender' and 'birthdate' property from ALL exogenous
                representations of the content

        If neither is set, or the content can not be loaded, nothing is done.

        Args:
            row (dict): dict-like parameter containing at least a 'from_id' field
        """
        user_id = row['from_id']
        content = load_content_instance(self.__user_contents_dir, user_id)
        if content is None:
            return

        representation = self.get_user_exogenous_representation()
        properties = self.get_user_exogenous_properties()

        if representation is not None:
            if properties is not None:
                # Provided representation and properties
                self._prop_by_rep(content, UserNode(user_id),
                                  representation, properties, row)
            else:
                # Provided only the representation
                self._all_prop_in_rep(content, UserNode(user_id),
                                      representation, row)
        elif properties is not None:
            # Provided only the properties
            self._prop_in_all_rep(content, UserNode(user_id),
                                  properties, row)
    def test_extract_features_item(self):
        """
        Checks that extracting the features of a loaded item yields a single-element
        result whose element is a dict
        """
        item = load_content_instance(
            os.path.join(contents_path, 'movies_codified/'), 'tt0112281')

        features = self.alg.extract_features_item(item)

        self.assertEqual(1, len(features))
        self.assertIsInstance(features[0], dict)
Exemple #7
0
    def __get_item_list(self, item_to_predict_id_list, user_ratings):
        """
        Builds the list of items on which predictions will be computed.

        Args:
            item_to_predict_id_list: ids of the items to load, if None all the items
                without a rating are used
            user_ratings: ratings used to determine the unrated items when no id list is given
        Returns:
            list of the items loaded from the items directory of the config
        """
        items_directory = self.__config.items_directory

        if item_to_predict_id_list is None:
            # all items without rating if the list is not set
            return get_unrated_items(items_directory, user_ratings)

        loaded_items = []
        for raw_item_id in item_to_predict_id_list:
            # Characters that are neither word characters nor whitespace are stripped
            # from the id before loading
            cleaned_item_id = re.sub(r'[^\w\s]', '', raw_item_id)
            loaded_items.append(
                load_content_instance(items_directory, cleaned_item_id))

        return loaded_items
Exemple #8
0
    def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
                candidate_item_id_list: list = None):
        """
        Collects the documents liked by the user and delegates the prediction to __recs_query.
        A document is considered liked when the score given by the user is strictly greater than
        the positive threshold of this object; liked items that can not be loaded are skipped.
        If __recs_query raises a ValueError or an EmptyIndexError, the error is logged as a
        warning and an empty frame is returned
        Args:
            ratings (pd.DataFrame): ratings of the user for which the ranking is computed
            recs_number (int): how long the ranking will be
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list (list): list of the items that can be recommended, if None
                all unrated items will be used
        Returns:
            (pd.DataFrame) dataframe that for each row has a suggested item id and a rating of
                said item. This rating represents how much the item matches the query used for
                retrieving the recommendation list
        EXAMPLES:
            Find a recommendation list with two items for a user:
                predict(ratings, 2, '../../example')
            Find a recommendation list with one item for a user considering a candidate list containing two items:
                predict(ratings, 1, '../../example', ['tt0114885', 'tt0114388'])
            Ratings is a variable containing a dataframe with the user ratings
            Ratings dataframe columns example: "from_id", "to_id", "original_rating", "score", "timestamp"
        """
        if DEVELOPING:
            index_path = os.path.join(items_directory, 'search_index')
        else:
            index_path = os.path.join(home_path, items_directory, 'search_index')

        liked_document_ids = []
        liked_scores = []
        for item_id, score in zip(ratings.to_id, ratings.score):
            if not score > self.__positive_threshold:
                continue

            item = load_content_instance(items_directory, item_id)
            if item is None:
                continue

            liked_document_ids.append(item.index_document_id)
            liked_scores.append(score)

        try:
            recommendations = self.__recs_query(liked_document_ids,
                                                ratings.to_id,
                                                liked_scores,
                                                recs_number,
                                                index_path,
                                                candidate_item_id_list)
        except (ValueError, EmptyIndexError) as e:
            logger.warning(str(e))
            # Fall back to an empty ranking with the expected columns
            return pd.DataFrame(columns=["to_id", "rating"])

        return recommendations
 def test_load_content_instance(self):
     """
     Loads an item from the "aaa" directory; a FileNotFoundError is tolerated.
     No assertion is made on the loaded value
     """
     directory, item_id = "aaa", '1'
     try:
         load_content_instance(directory, item_id)
     except FileNotFoundError:
         # A missing file is an accepted outcome for this test
         return
    def predict(self, user_id: str, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        1) Goes into items_directory and for each item takes the values corresponding to the field_representation of
        the item_field. For example, if item_field == "Plot" and field_representation == "tf-idf", the function will
        take the "tf-idf" representation of the "Plot" field of every rated item and of every item to classify;
        2) Defines the target labels: rated items with score greater than or equal to the threshold are positive
        examples (1), the others are negative examples (0). If the threshold is -1, the mean of the scores is used;
        3) Creates the classifier selected in the constructor, fits it on the rated items and predicts the
        probabilities for the items to classify

        Items that could not be loaded (None) are ignored, both among rated and candidate items.

        Args:
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used
            user_id: user for which recommendations will be computed (not read by this method)
            recs_number (int): How long the ranking will be
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            items_directory (str): Name of the directory where the items are stored.

        Returns:
            (pd.DataFrame) frame with columns "to_id" and "rating", sorted by descending rating and
                truncated to recs_number rows; the rating is the second column of predict_proba
        """

        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [load_content_instance(items_directory, item_id) for item_id in candidate_item_id_list]

        # Items that could not be loaded are dropped up front: the predicted probabilities are
        # computed only for valid items, so pairing them with a list that still contains None
        # would shift every following score onto the wrong item
        unrated_items = [item for item in unrated_items if item is not None]

        item_field = self.get_item_field()
        item_field_representation = self.get_item_field_representation()

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)
        if self.__threshold == -1:
            threshold = pd.to_numeric(ratings["score"], downcast="float").mean()
        else:
            threshold = self.__threshold

        rated_features_bag_list = []
        labels = []
        for item in rated_items:
            if item is not None:
                rated_features_bag_list.append(
                    item.get_field(item_field).get_representation(item_field_representation).get_value())
                labels.append(1 if float(ratings[ratings['to_id'] == item.get_content_id()].score) >= threshold else 0)

        logger.info("Labeling examples")
        unrated_features_bag_list = [
            item.get_field(item_field).get_representation(item_field_representation).get_value()
            for item in unrated_items
        ]

        # NOTE(review): a classifier name not listed below leaves clf as None
        classifier_name = self.__classifier.lower()
        clf = None
        if classifier_name == "random_forest":
            clf = RandomForestClassifier(n_estimators=400, random_state=42)
        elif classifier_name == "svm":
            clf = CalibratedClassifierCV(LinearSVC(random_state=42))
        elif classifier_name == "log_regr":
            clf = LogisticRegression(random_state=42)
        elif classifier_name == "knn":
            clf = neighbors.KNeighborsClassifier()
        elif classifier_name == "decision_tree":
            clf = DecisionTreeClassifier(random_state=42)
        elif classifier_name == "gaussian_process":
            clf = GaussianProcessClassifier(random_state=42)

        logger.info("Fitting classifier")
        if classifier_name == "gaussian_process":
            # For this classifier the sparse output of the vectorizer is converted to a dense matrix
            pipe = make_pipeline(DictVectorizer(sparse=True), FunctionTransformer(lambda x: x.todense(), accept_sparse=True), clf)
        else:
            pipe = make_pipeline(DictVectorizer(sparse=True), clf)

        pipe = pipe.fit(rated_features_bag_list, labels)

        columns = ["to_id", "rating"]

        logger.info("Predicting scores")
        score_labels = pipe.predict_proba(unrated_features_bag_list)

        # The frame is built in a single step instead of being re-concatenated at every iteration
        records = [(item.get_content_id(), score[1]) for score, item in zip(score_labels, unrated_items)]
        score_frame = pd.DataFrame.from_records(records, columns=columns)

        score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
        score_frame = score_frame[:recs_number]

        return score_frame
    def predict(self,
                user_id: str,
                ratings: pd.DataFrame,
                recs_number: int,
                items_directory: str,
                candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Ranks the candidate items by their similarity with the centroid of the rated items.

        Checks performed on the first rated item:
        1) The field item_field exists
        2) The representation item_field_representation exists for that field
        3) The representation is either an embedding or a features bag (tf-idf); if it is an
           embedding, it must be a document embedding (one-dimensional)

        Then the centroid is computed and, for each candidate item, the similarity between the
        centroid and the representation of the item is computed. When the representation is a
        features bag, the vectors are obtained through __get_centroid_with_vectorizer.

        Candidate items that could not be loaded (None) are ignored.

        Args:
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used
            user_id: user for which recommendations will be computed (not read by this method)
            recs_number (int): How long the ranking will be
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            items_directory (str): Name of the directory where the items are stored.

        Returns:
             scores (pd.DataFrame): DataFrame whose columns are the ids of the items (to_id), and the similarities
                  between the items and the centroid (rating), sorted by descending rating and truncated to
                  recs_number rows. It is empty if there are no rated items

        Raises:
            ValueError: if the field or the representation can not be found, or if the representation
                is not of a supported kind
        """

        columns = ["to_id", "rating"]

        logger.info("Retrieving candidate items")
        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [
                load_content_instance(items_directory, item_id)
                for item_id in candidate_item_id_list
            ]

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)
        if len(rated_items) == 0:
            return pd.DataFrame(columns=columns)

        # The first rated item is used to validate the configured field and representation
        first_item = rated_items[0]
        if self.item_field not in first_item.field_dict:
            raise ValueError("The field name specified could not be found!")

        try:
            representation = first_item.get_field(
                self.item_field).get_representation(
                    self.item_field_representation)
        except KeyError:
            raise ValueError(
                "The given representation id wasn't found for the specified field"
            )

        if not isinstance(representation, (EmbeddingField, FeaturesBagField)):
            raise ValueError(
                "The given representation must be an embedding or a tf-idf vector"
            )

        if isinstance(representation, EmbeddingField) and len(representation.value.shape) != 1:
            raise ValueError(
                "The specified representation is not a document embedding, so the centroid"
                " can not be calculated")

        need_vectorizer = isinstance(representation, FeaturesBagField)

        # (item id, similarity) pairs, turned into a frame in a single step at the end
        records = []

        if not need_vectorizer:
            logger.info("Computing centroid")
            centroid = self.__get_centroid_without_vectorizer(
                ratings, rated_items)
            logger.info("Computing similarities")

            for item in unrated_items:
                # Candidates that could not be loaded are skipped
                if item is None:
                    continue

                item_id = item.content_id
                item_field_representation = item.get_field(
                    self.item_field).get_representation(
                        self.item_field_representation).value
                logger.info("Computing similarity with %s" % item_id)
                similarity = self.__similarity.perform(
                    DenseVector(centroid),
                    DenseVector(item_field_representation))
                records.append((item_id, similarity))
        else:
            logger.info("Computing centroid")
            centroid, unrated_matrix = self.__get_centroid_with_vectorizer(
                ratings, rated_items, unrated_items)

            logger.info("Computing similarities")

            # None entries are removed before pairing the items with the rows of the matrix
            # NOTE(review): presumably the helper builds one row per valid item - confirm
            valid_unrated_items = [x for x in unrated_items if x is not None]

            for item, item_array in zip(valid_unrated_items, unrated_matrix):
                item_id = item.content_id
                logger.info("Computing similarity with %s" % item_id)
                similarity = self.__similarity.perform(
                    SparseVector(centroid), SparseVector(item_array))
                records.append((item_id, similarity))

        scores = pd.DataFrame.from_records(records, columns=columns)
        scores = scores.sort_values(['rating'],
                                    ascending=False).reset_index(drop=True)
        scores = scores[:recs_number]

        return scores
Exemple #12
0
 def test_load_content_instance(self):
     """Loading an unknown item id from an unknown directory must yield None"""
     missing = load_content_instance("not_existent", "invalid_item")
     self.assertIsNone(missing)