Ejemplo n.º 1
0
    def test_get_rated_items(self):
        """
        Check that every item referenced in a ratings frame is returned by get_rated_items.

        The content id of each loaded item is collected and every 'to_id' of the
        ratings frame must appear among them.
        """
        expected_ids = ['tt0112281', 'tt0113497']
        ratings_frame = pd.DataFrame({'to_id': expected_ids})

        # Collect the content ids of whatever was loaded from movies_dir
        found_ids = []
        for loaded in get_rated_items(movies_dir, ratings_frame):
            found_ids.append(loaded.content_id)

        for expected in expected_ids:
            self.assertIn(expected, found_ids)
    def process_rated(self, user_ratings: pd.DataFrame, items_directory: str):
        """
        Extract features from the items rated by a user and label them.

        Every rated item that could be loaded is labeled 1 when its score is greater than
        or equal to the threshold, 0 otherwise. The threshold is `self.threshold` or, when
        that is None, the value returned by `_calc_mean_user_threshold` for the user ratings.
        The extracted features will be later used to fit the classifier.

        Features and labels are stored in private attributes of the class.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored

        Raises:
            EmptyUserRatings: if `user_ratings` is empty
            NoRatedItems: if none of the rated items is available locally
            OnlyPositiveItems: if every available rated item was labeled as positive
            OnlyNegativeItems: if every available rated item was labeled as negative
        """
        # Fail fast: without ratings there is nothing to load and no mean threshold to compute
        if user_ratings.empty:
            raise EmptyUserRatings("The user selected doesn't have any ratings!")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, user_ratings)

        threshold = self.threshold
        if threshold is None:
            threshold = self._calc_mean_user_threshold(user_ratings)

        # Assign label and extract features from the rated items
        labels = []
        rated_dict = {}

        recsys_logger.info("Processing rated items")
        for item in rated_items:
            # Items that are not available locally are skipped
            if item is None:
                continue

            rated_dict[item] = self.extract_features_item(item)

            # This conversion raises Exception when there are multiple same to_id for the user
            score_assigned = float(user_ratings[user_ratings['to_id'] == item.content_id].score)
            labels.append(1 if score_assigned >= threshold else 0)

        user_id = user_ratings.from_id.iloc[0]
        if len(rated_dict) == 0:
            raise NoRatedItems("User {} - No rated item available locally!".format(user_id))
        if 0 not in labels:
            raise OnlyPositiveItems("User {} - There are only positive items available locally!".format(user_id))
        elif 1 not in labels:
            raise OnlyNegativeItems("User {} - There are only negative items available locally!".format(user_id))

        self.__labels = labels
        self.__rated_dict = rated_dict
Ejemplo n.º 3
0
    def preprocessing(self, items_directory: str, ratings: pd.DataFrame, candidate_item_id_list: list = None):
        """
        Retrieve the data that will be used in the computation of the ranking.

        It loads the rated and unrated items, computes the threshold if it was set to -1 and
        extracts the features from the unrated items.

        Args:
            items_directory (str): directory where the items are stored
            ratings (Dataframe): dataframe which contains ratings given by the user
            candidate_item_id_list (list): list of the items that can be recommended, if None
                or empty all unrated items will be used

        Returns:
            rated_items (list): list containing the instances of the rated items
            unrated_items (list): list containing the instances of the unrated items
            unrated_features_baglist (list): list containing the features extracted from the unrated items

        Raises:
            ValueError: if the threshold value specified in the constructor of the class is not
                in the range [-1, 1], or if there are no valid unrated items to consider
                (example: candidate_item_id_list is ['test', 'test2'] but neither test nor test2
                are items in the items directory)
        """

        # An exception is thrown if the threshold value is not in the range
        if not -1 <= self.__threshold <= 1:
            raise ValueError("Threshold value must be in the range [-1, 1]")

        # If threshold is the min possible (range is [-1, 1]), we calculate the mean value
        # of all the ratings and set it as the threshold.
        # NOTE(review): the mean overwrites the -1 sentinel on the instance, so later calls
        # reuse this value instead of recomputing it - confirm this is intended
        if self.__threshold == -1:
            self.__threshold = pd.to_numeric(ratings["score"], downcast="float").mean()

        # Load unrated items from the path
        if candidate_item_id_list is None or len(candidate_item_id_list) == 0:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            # If a candidate list is specified, only valid items are kept (None is never added
            # to the list). Each candidate is loaded exactly once.
            loaded_candidates = (load_content_instance(items_directory, item_id)
                                 for item_id in candidate_item_id_list)
            unrated_items = [item for item in loaded_candidates if item is not None]

        if len(unrated_items) == 0:
            raise ValueError("No valid unrated items found")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, ratings)

        return rated_items, unrated_items, self.__calc_unrated_baglist(unrated_items)
Ejemplo n.º 4
0
    def process_rated(self, user_ratings: pd.DataFrame, items_directory: str):
        """
        Extract features from positive rated items ONLY.

        An item is positive when its score is greater than or equal to the threshold. The
        threshold is `self.threshold` or, when that is None, the value returned by
        `_calc_mean_user_threshold` for the user ratings.
        The extracted features will be used to fit the algorithm (build the query).

        Features extracted are stored in private attributes of the class.

        Args:
            user_ratings (pd.DataFrame): DataFrame containing ratings of a single user
            items_directory (str): path of the directory where the items are stored

        Raises:
            EmptyUserRatings: if `user_ratings` is empty
            NoRatedItems: if none of the rated items is available locally
            OnlyNegativeItems: if no available rated item reaches the threshold
        """
        # Fail fast: without ratings there is nothing to load and no mean threshold to compute
        if user_ratings.empty:
            raise EmptyUserRatings(
                "The user selected doesn't have any ratings!")

        # Load rated items from the path
        rated_items = get_rated_items(items_directory, user_ratings)

        recsys_logger.info("Processing rated items")
        # If threshold wasn't passed in the constructor, then we take the mean rating
        # given by the user as its threshold
        threshold = self.threshold
        if threshold is None:
            threshold = self._calc_mean_user_threshold(user_ratings)

        # Extract features from the positive rated items
        positive_rated_dict = {}
        for item in rated_items:
            # The None check must come before reading item.content_id
            if item is None:
                continue

            score_assigned = float(
                user_ratings[user_ratings['to_id'] == item.content_id].score)
            if score_assigned >= threshold:
                positive_rated_dict[item] = self.extract_features_item(item)

        user_id = user_ratings.from_id.iloc[0]
        if len(rated_items) == 0 or all(item is None for item in rated_items):
            raise NoRatedItems(
                "User {} - No rated items available locally!".format(user_id))
        if len(positive_rated_dict) == 0:
            raise OnlyNegativeItems(
                "User {} - There are only negative items available locally!".format(user_id))

        self.__positive_rated_dict = positive_rated_dict
Ejemplo n.º 5
0
    def predict(self, user_id: str, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Rank candidate items with a classifier fitted on the items rated by the user.

        1) For each rated item and each candidate item, takes the value of the configured
        representation of the configured field (for example the "tf-idf" representation of the
        "Plot" field);
        2) Defines the target labels: rated items with score greater than or equal to the threshold
        are positive examples (1), the others are negative examples (0). If the threshold is -1 the
        mean of the scores in `ratings` is used;
        3) Builds the configured classifier, fits it on the rated items and predicts the probability
        of the candidate items.

        Items that could not be loaded (None) are ignored.

        Args:
            user_id: user for which recommendations will be computed (not used in the computation)
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            recs_number (int): how long the ranking will be
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used

        Returns:
            pd.DataFrame with columns "to_id" and "rating", sorted by descending rating and
            truncated to the first `recs_number` rows

        Raises:
            ValueError: if the configured classifier name is not one of the supported ones
        """

        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [load_content_instance(items_directory, item_id) for item_id in candidate_item_id_list]

        # Drop items that could not be loaded up front, so that the feature bags and the predicted
        # scores stay aligned one-to-one with the items they refer to
        unrated_items = [item for item in unrated_items if item is not None]

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)
        if self.__threshold == -1:
            threshold = pd.to_numeric(ratings["score"], downcast="float").mean()
        else:
            threshold = self.__threshold

        field_name = self.get_item_field()
        representation_id = self.get_item_field_representation()

        rated_features_bag_list = []
        labels = []
        for item in rated_items:
            if item is not None:
                rated_features_bag_list.append(item.get_field(field_name).get_representation(representation_id).get_value())
                labels.append(1 if float(ratings[ratings['to_id'] == item.get_content_id()].score) >= threshold else 0)

        logger.info("Labeling examples")
        unrated_features_bag_list = [
            item.get_field(field_name).get_representation(representation_id).get_value()
            for item in unrated_items
        ]

        # Only the selected classifier is instantiated
        classifier_name = self.__classifier.lower()
        classifier_factories = {
            "random_forest": lambda: RandomForestClassifier(n_estimators=400, random_state=42),
            "svm": lambda: CalibratedClassifierCV(LinearSVC(random_state=42)),
            "log_regr": lambda: LogisticRegression(random_state=42),
            "knn": lambda: neighbors.KNeighborsClassifier(),
            "decision_tree": lambda: DecisionTreeClassifier(random_state=42),
            "gaussian_process": lambda: GaussianProcessClassifier(random_state=42),
        }
        if classifier_name not in classifier_factories:
            raise ValueError("Unknown classifier: {}".format(self.__classifier))
        clf = classifier_factories[classifier_name]()

        logger.info("Fitting classifier")
        if classifier_name == "gaussian_process":
            # The vectorized features are converted to a dense matrix before reaching this classifier
            pipe = make_pipeline(DictVectorizer(sparse=True), FunctionTransformer(lambda x: x.todense(), accept_sparse=True), clf)
        else:
            pipe = make_pipeline(DictVectorizer(sparse=True), clf)

        pipe = pipe.fit(rated_features_bag_list, labels)

        columns = ["to_id", "rating"]

        logger.info("Predicting scores")
        score_labels = pipe.predict_proba(unrated_features_bag_list)

        # score[1] is the second column of predict_proba, i.e. the probability of label 1
        # when both labels are present in the training data
        records = [(item.get_content_id(), score[1]) for score, item in zip(score_labels, unrated_items)]
        score_frame = pd.DataFrame.from_records(records, columns=columns)

        score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
        score_frame = score_frame[:recs_number]

        return score_frame
Ejemplo n.º 6
0
    def predict(self,
                user_id: str,
                ratings: pd.DataFrame,
                recs_number: int,
                items_directory: str,
                candidate_item_id_list: List = None) -> pd.DataFrame:
        """
        Rank candidate items by their similarity with the centroid of the rated items.

        Checks performed on the first rated item that could be loaded:
        1) The configured field exists
        2) The configured representation exists for that field
        3) The representation is either an embedding or a features bag (tf-idf)
        4) If it is an embedding, it is a document embedding (its shape has exactly one dimension)

        The centroid is then computed (with a vectorizer when the representation is a features
        bag, without otherwise) and, for each candidate item that could be loaded, the similarity
        between the centroid and the representation of the item is computed.

        Args:
            user_id: user for which recommendations will be computed (not used in the computation)
            ratings (pd.DataFrame): ratings of the user with id equal to user_id
            recs_number (int): how long the ranking will be
            items_directory (str): name of the directory where the items are stored
            candidate_item_id_list: list of the items that can be recommended, if None
                all unrated items will be used

        Returns:
             scores (pd.DataFrame): DataFrame whose columns are the ids of the items (to_id), and the
                  similarities between the items and the centroid (rating), sorted by descending
                  rating and truncated to the first `recs_number` rows. It is empty if no rated
                  item could be loaded

        Raises:
            ValueError: if the field or the representation can't be found, or if the representation
                is not an embedding / features bag, or if it is an embedding which is not a
                document embedding
        """
        columns = ["to_id", "rating"]

        logger.info("Retrieving candidate items")
        if candidate_item_id_list is None:
            unrated_items = get_unrated_items(items_directory, ratings)
        else:
            unrated_items = [
                load_content_instance(items_directory, item_id)
                for item_id in candidate_item_id_list
            ]

        logger.info("Retrieving rated items")
        rated_items = get_rated_items(items_directory, ratings)

        # The first rated item actually loaded is used to validate field and representation.
        # With no usable rated item there is nothing to build a centroid from.
        first_item = next((item for item in rated_items if item is not None), None)
        if first_item is None:
            return pd.DataFrame(columns=columns)

        if self.item_field not in first_item.field_dict:
            raise ValueError("The field name specified could not be found!")

        try:
            representation = first_item.get_field(
                self.item_field).get_representation(
                    self.item_field_representation)
        except KeyError as err:
            raise ValueError(
                "The given representation id wasn't found for the specified field"
            ) from err

        if not isinstance(representation, (EmbeddingField, FeaturesBagField)):
            raise ValueError(
                "The given representation must be an embedding or a tf-idf vector"
            )

        if isinstance(representation, EmbeddingField) and len(representation.value.shape) != 1:
            raise ValueError(
                "The specified representation is not a document embedding, so the centroid"
                " can not be calculated")

        # Features bags must be vectorized before computing the centroid
        need_vectorizer = isinstance(representation, FeaturesBagField)

        # Candidate items that could not be loaded are skipped in both branches
        loaded_unrated_items = [item for item in unrated_items if item is not None]

        records = []
        if not need_vectorizer:
            logger.info("Computing centroid")
            centroid = self.__get_centroid_without_vectorizer(
                ratings, rated_items)
            logger.info("Computing similarities")

            for item in loaded_unrated_items:
                item_id = item.content_id
                item_field_representation = item.get_field(
                    self.item_field).get_representation(
                        self.item_field_representation).value
                logger.info("Computing similarity with %s", item_id)
                similarity = self.__similarity.perform(
                    DenseVector(centroid),
                    DenseVector(item_field_representation))
                records.append((item_id, similarity))
        else:
            logger.info("Computing centroid")
            # The helper receives the unfiltered candidate list, exactly as before
            centroid, unrated_matrix = self.__get_centroid_with_vectorizer(
                ratings, rated_items, unrated_items)

            logger.info("Computing similarities")

            for item, item_array in zip(loaded_unrated_items, unrated_matrix):
                item_id = item.content_id
                logger.info("Computing similarity with %s", item_id)
                similarity = self.__similarity.perform(
                    SparseVector(centroid), SparseVector(item_array))
                records.append((item_id, similarity))

        # Build the frame once instead of concatenating one row at a time
        scores = pd.DataFrame.from_records(records, columns=columns)
        scores = scores.sort_values(['rating'],
                                    ascending=False).reset_index(drop=True)
        scores = scores[:recs_number]

        return scores
 def test_get_rated_items(self):
     """Smoke test: calling get_rated_items on the 'movies_codified' directory must not raise."""
     rated_frame = pd.DataFrame({'to_id': ['tt0112281', 'tt0113497']})
     codified_dir = os.path.join(contents_path, 'movies_codified')
     get_rated_items(codified_dir, rated_frame)