def test_load_content_instance(self):
    """
    Checks load_content_instance on two lookups:

    1) item "tt0112281" under contents_path must come back with a matching content_id
    2) item "1" under the directory "aaa" must come back equal to None

    A FileNotFoundError raised by either lookup is reported as a test failure.
    """
    expected_id = "tt0112281"
    try:
        loaded = load_content_instance(contents_path, expected_id)
        self.assertEqual(loaded.content_id, expected_id)

        missing = load_content_instance("aaa", "1")
        self.assertEqual(missing, None)
    except FileNotFoundError:
        self.fail("File not found!")
def preprocessing(self, items_directory: str, ratings: pd.DataFrame, candidate_item_id_list: list = None):
    """
    Function used to retrieve data that will be used in the computation of the ranking.
    It loads the rated and unrated items, computes the threshold if it was set to -1 and
    extracts the features from the unrated items.

    Args:
        items_directory (str): directory where the items are stored
        ratings (Dataframe): dataframe which contains ratings given by the user
            (its "score" column is read when the threshold has to be computed)
        candidate_item_id_list (list): list of the items that can be recommended,
            if None (or empty) all unrated items will be used

    Returns:
        rated_items (list): list containing the instances of the rated items
        unrated_items (list): list containing the instances of the unrated items
        unrated_features_baglist (list): list containing the features extracted from the unrated items

    Raises:
        ValueError: if the threshold set on the instance is not in the range [-1, 1]
        ValueError: if a candidate list was passed but none of its ids could be loaded
            (example: ['test', 'test2'] but neither test nor test2 are items in the
            items directory)
    """
    if not -1 <= self.__threshold <= 1:
        raise ValueError("Threshold value must be in the range [-1, 1]")

    # -1 is the minimum of the valid range and doubles as "compute it for me":
    # the mean of all the ratings becomes the threshold.
    # NOTE(review): the computed mean is written back to the instance, so later calls
    # (e.g. for another user's ratings) will not recompute it - confirm this is intended.
    if self.__threshold == -1:
        self.__threshold = pd.to_numeric(ratings["score"], downcast="float").mean()

    if candidate_item_id_list is None or len(candidate_item_id_list) == 0:
        unrated_items = get_unrated_items(items_directory, ratings)
    else:
        # Load every candidate exactly once and keep only the ones that exist.
        # (The previous implementation loaded each candidate twice: once to test
        # it against None and once more to store it.)
        loaded_candidates = (load_content_instance(items_directory, item_id)
                             for item_id in candidate_item_id_list)
        unrated_items = [item for item in loaded_candidates if item is not None]

        if len(unrated_items) == 0:
            raise ValueError("No valid unrated items found")

    rated_items = get_rated_items(items_directory, ratings)

    return rated_items, unrated_items, self.__calc_unrated_baglist(unrated_items)
def fit_eval_predict(self, user_id, user_ratings: pd.DataFrame, test_set: pd.DataFrame):
    """
    Loads the items referenced by the test set and delegates the prediction to the
    score prediction algorithm stored in the config.

    Args:
        user_id: user for which predictions will be computed
        user_ratings: train set, forwarded to the algorithm
        test_set: frame whose `to_id` column lists the ids of the items to predict

    Returns:
        score_frame (DataFrame): whatever the configured algorithm's `predict` returns
            (documented by the original author as a frame with columns to_id, rating)
    """
    config = self.__config

    logger.info("Loading items")
    items = []
    for raw_id in test_set.to_id:
        # every character that is neither a word character nor whitespace
        # is stripped from the id before it is used for the lookup
        clean_id = re.sub(r'[^\w\s]', '', raw_id)
        items.append(load_content_instance(config.items_directory, clean_id))
    logger.info("Loaded %d items" % len(items))

    logger.info("Computing predictions")
    return config.score_prediction_algorithm.predict(
        user_id, items, user_ratings, config.items_directory)
def predict(self, user_id: str, ratings: pd.DataFrame, recs_number, items_directory: str, candidate_item_id_list: List = None):
    """
    Finds the documents that the user liked (score strictly greater than the positive
    threshold of this object) and then calls __recs_query to execute the prediction

    Args:
        user_id: user for which recommendations will be computed (not used by this method)
        ratings (pd.DataFrame): ratings of the user with id equal to user_id; the columns
            `to_id` and `score` are read
        recs_number: how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list: list of the items that can be recommended, forwarded
            unchanged to __recs_query

    Returns:
        (pd.DataFrame) the result of __recs_query
    """
    if DEVELOPING:
        index_path = os.path.join(items_directory, 'search_index')
    else:
        index_path = os.path.join(home_path, items_directory, 'search_index')

    scores = []
    rated_document_list = []
    for item_id, score in zip(ratings.to_id, ratings.score):
        # Only positively rated items are needed, so items below the threshold
        # are not loaded at all.
        if score > self.__positive_threshold:
            item = load_content_instance(items_directory, item_id)
            # load_content_instance may yield None for an id that cannot be loaded;
            # such ratings are skipped instead of failing on a None attribute access.
            if item is not None:
                rated_document_list.append(item.index_document_id)
                scores.append(score)

    return self.__recs_query(rated_document_list, scores, recs_number, index_path, candidate_item_id_list)
def _add_usr_properties(self, row):
    """
    Private method that, given a row containing the 'from_id' field, tries to load the
    matching content from the user contents directory. When the content is found, the
    extraction is dispatched according to which of the user exogenous settings are set:

        representation set, properties set:
            _prop_by_rep     -> the given properties from the given representation
            EXAMPLE: representation = 0, properties = ['gender', 'birthdate']

        representation set, properties NOT set:
            _all_prop_in_rep -> ALL properties of the given representation
            EXAMPLE: representation = 0

        representation NOT set, properties set:
            _prop_in_all_rep -> the given properties from ALL representations
            EXAMPLE: properties = ['gender', 'birthdate']

    Nothing is done when the content cannot be loaded or when neither setting is set.

    Args:
        row (dict): dict-like parameter containing at least a 'from_id' field
    """
    user_id = row['from_id']
    content = load_content_instance(self.__user_contents_dir, user_id)
    if content is None:
        return

    representation = self.get_user_exogenous_representation()
    properties = self.get_user_exogenous_properties()
    has_representation = representation is not None
    has_properties = properties is not None

    if has_representation and has_properties:
        # Provided representation and properties
        self._prop_by_rep(content, UserNode(user_id), representation, properties, row)
    elif has_representation:
        # Provided only the representation
        self._all_prop_in_rep(content, UserNode(user_id), representation, row)
    elif has_properties:
        # Provided only the properties
        self._prop_in_all_rep(content, UserNode(user_id), properties, row)
def test_extract_features_item(self):
    """
    Loads item 'tt0112281' from the 'movies_codified/' directory under contents_path and
    checks that extract_features_item returns exactly one element, and that it is a dict.
    """
    item = load_content_instance(os.path.join(contents_path, 'movies_codified/'), 'tt0112281')

    features = self.alg.extract_features_item(item)

    self.assertEqual(1, len(features))
    first_bag = features[0]
    self.assertIsInstance(first_bag, dict)
def __get_item_list(self, item_to_predict_id_list, user_ratings):
    """
    Builds the list of items to predict.

    Args:
        item_to_predict_id_list: ids of the items to load, or None
        user_ratings: ratings of the user, only used when no id list is given

    Returns:
        the result of get_unrated_items when the id list is None; otherwise a list with
        one load_content_instance result per id, in the same order as the ids
    """
    items_directory = self.__config.items_directory

    # all items without rating if the list is not set
    if item_to_predict_id_list is None:
        return get_unrated_items(items_directory, user_ratings)

    loaded_items = []
    for raw_id in item_to_predict_id_list:
        # ids are looked up with everything but word characters and whitespace removed
        clean_id = re.sub(r'[^\w\s]', '', raw_id)
        loaded_items.append(load_content_instance(items_directory, clean_id))
    return loaded_items
def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: list = None):
    """
    Collects the documents the user liked and runs the recommendation query on them.

    A rated item counts as liked when its score is strictly greater than the
    positive threshold of this object and the item can be loaded from items_directory.
    The liked documents are then handed to __recs_query.

    Args:
        ratings (pd.DataFrame): ratings of the user; the columns `to_id` and `score` are read
        recs_number (int): how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list (list): list of the items that can be recommended,
            forwarded unchanged to __recs_query

    Returns:
        (pd.DataFrame) the result of __recs_query; if the query raises ValueError or
        EmptyIndexError the error is logged as a warning and an empty dataframe with
        the columns "to_id" and "rating" is returned instead

    EXAMPLES:
        Ask for a recommendation list with two items:
            predict(ratings, 2, '../../example')
        Ask for a recommendation list with one item, restricted to two candidates:
            predict(ratings, 1, '../../example', ['tt0114885', 'tt0114388'])

        Ratings dataframe columns example: "from_id", "to_id", "original_rating", "score", "timestamp"
    """
    path_parts = [items_directory, 'search_index']
    if not DEVELOPING:
        path_parts.insert(0, home_path)
    index_path = os.path.join(*path_parts)

    liked_document_ids = []
    liked_scores = []
    for item_id, score in zip(ratings.to_id, ratings.score):
        if not score > self.__positive_threshold:
            continue
        item = load_content_instance(items_directory, item_id)
        if item is None:
            continue
        liked_document_ids.append(item.index_document_id)
        liked_scores.append(score)

    try:
        return self.__recs_query(liked_document_ids, ratings.to_id, liked_scores, recs_number,
                                 index_path, candidate_item_id_list)
    except (ValueError, EmptyIndexError) as e:
        logger.warning(str(e))
        return pd.DataFrame(columns=["to_id", "rating"])
def test_load_content_instance(self):
    """
    Calls load_content_instance on item '1' of the directory "aaa".
    A FileNotFoundError is tolerated; nothing is asserted on the returned value.
    """
    directory, item_id = "aaa", '1'
    try:
        load_content_instance(directory, item_id)
    except FileNotFoundError:
        # a missing file is an accepted outcome for this lookup
        return
def predict(self, user_id: str, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: List = None) -> pd.DataFrame:
    """
    1) Goes into items_directory and for each item takes the value of the configured
    representation of the configured item field (for example the "tf-idf" representation
    of the "Plot" field);
    2) Defines target features: rated items whose score is greater than or equal to the
    threshold are used as positive examples (label 1), the others as negative (label 0).
    If the threshold of this object is -1, the mean of the "score" column is used instead;
    3) Fits the configured classifier on the rated items and ranks the candidate items by
    the second column of predict_proba (the probability of label 1 when both labels are
    present in the training data)

    Args:
        user_id: user for which recommendations will be computed (not used by this method)
        ratings (pd.DataFrame): ratings of the user with id equal to user_id; the columns
            `to_id` and `score` are read
        recs_number (int): how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list: list of the items that can be recommended, if None all
            unrated items will be used. Ids that cannot be loaded are ignored

    Returns:
        (pd.DataFrame) frame with columns "to_id" and "rating", sorted by rating in
        descending order and cut to recs_number rows. It is empty when there are no rated
        items to learn from or no candidate items to score

    Raises:
        ValueError: if the configured classifier name is not one of "random_forest",
            "svm", "log_regr", "knn", "decision_tree", "gaussian_process"
    """
    columns = ["to_id", "rating"]

    # Constructors are wrapped in lambdas so that only the requested classifier is built.
    classifier_name = self.__classifier.lower()
    classifier_factories = {
        "random_forest": lambda: RandomForestClassifier(n_estimators=400, random_state=42),
        "svm": lambda: CalibratedClassifierCV(LinearSVC(random_state=42)),
        "log_regr": lambda: LogisticRegression(random_state=42),
        "knn": lambda: neighbors.KNeighborsClassifier(),
        "decision_tree": lambda: DecisionTreeClassifier(random_state=42),
        "gaussian_process": lambda: GaussianProcessClassifier(random_state=42),
    }
    # Fail early and explicitly: previously an unknown name left the classifier as None,
    # which only surfaced later as an unrelated error.
    if classifier_name not in classifier_factories:
        raise ValueError("Unknown classifier '%s', expected one of: %s"
                         % (self.__classifier, ", ".join(sorted(classifier_factories))))

    if candidate_item_id_list is None:
        unrated_items = get_unrated_items(items_directory, ratings)
    else:
        unrated_items = [load_content_instance(items_directory, item_id)
                         for item_id in candidate_item_id_list]
    # Items that could not be loaded are dropped *before* the features are extracted.
    # Scores are later paired with the items by position, so both sequences must have
    # the same length and order.
    unrated_items = [item for item in unrated_items if item is not None]

    logger.info("Retrieving rated items")
    rated_items = get_rated_items(items_directory, ratings)

    if self.__threshold == -1:
        threshold = pd.to_numeric(ratings["score"], downcast="float").mean()
    else:
        threshold = self.__threshold

    item_field = self.get_item_field()
    item_field_representation = self.get_item_field_representation()

    def extract_features(item):
        # value of the configured representation of the configured field
        return item.get_field(item_field).get_representation(item_field_representation).get_value()

    rated_features_bag_list = []
    labels = []
    for item in rated_items:
        if item is not None:
            rated_features_bag_list.append(extract_features(item))
            item_scores = ratings.loc[ratings['to_id'] == item.get_content_id(), 'score']
            # .iloc[0] instead of float(Series): the latter fails when the same item
            # appears more than once in the ratings. The first rating is used.
            labels.append(1 if float(item_scores.iloc[0]) >= threshold else 0)

    logger.info("Labeling examples")
    unrated_features_bag_list = [extract_features(item) for item in unrated_items]

    # Nothing to learn from or nothing to score: return an empty ranking
    if not rated_features_bag_list or not unrated_features_bag_list:
        return pd.DataFrame(columns=columns)

    logger.info("Fitting classifier")
    steps = [DictVectorizer(sparse=True)]
    if classifier_name == "gaussian_process":
        # This classifier gets a dense input; toarray() yields a plain ndarray
        # (todense() would yield a np.matrix).
        steps.append(FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
    steps.append(classifier_factories[classifier_name]())
    pipe = make_pipeline(*steps).fit(rated_features_bag_list, labels)

    logger.info("Predicting scores")
    score_labels = pipe.predict_proba(unrated_features_bag_list)

    # NOTE(review): score[1] assumes predict_proba has two columns, i.e. that both labels
    # occur among the rated items - confirm how single-class users should be handled.
    records = [(item.get_content_id(), score[1])
               for score, item in zip(score_labels, unrated_items)]
    # The frame is built once instead of growing it with pd.concat for every item.
    score_frame = pd.DataFrame(records, columns=columns)

    score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
    return score_frame[:recs_number]
def predict(self, user_id: str, ratings: pd.DataFrame, recs_number: int, items_directory: str, candidate_item_id_list: List = None) -> pd.DataFrame:
    """
    Ranks the candidate items by their similarity with the centroid of the rated items.

    Checks (performed on the first rated item only):
        1) the configured item field exists in the item
        2) the configured representation exists for that field
        3) the representation is an EmbeddingField or a FeaturesBagField
        4) an EmbeddingField must be a document embedding (its value has exactly one dimension)

    Then:
        - for an EmbeddingField the centroid is computed by __get_centroid_without_vectorizer
          and compared with the representation value of every candidate as dense vectors
        - for a FeaturesBagField the centroid and the candidates matrix are computed by
          __get_centroid_with_vectorizer and compared row by row as sparse vectors

    Args:
        user_id: user for which recommendations will be computed (not used by this method)
        ratings (pd.DataFrame): ratings of the user with id equal to user_id
        recs_number (int): how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list: list of the items that can be recommended, if None all
            unrated items will be used. Ids that cannot be loaded are ignored

    Returns:
        scores (pd.DataFrame): DataFrame whose columns are the ids of the items (to_id), and the
        similarities between the items and the centroid (rating), sorted by rating in
        descending order and cut to recs_number rows. Empty if there are no rated items

    Raises:
        ValueError: if one of the checks listed above fails
    """
    columns = ["to_id", "rating"]

    logger.info("Retrieving candidate items")
    if candidate_item_id_list is None:
        unrated_items = get_unrated_items(items_directory, ratings)
    else:
        unrated_items = [load_content_instance(items_directory, item_id)
                         for item_id in candidate_item_id_list]
    # Candidates that could not be loaded are dropped once, up front, for both branches
    # below. Previously only the vectorizer branch filtered them (and only after the
    # centroid helper had already received them), while the dense branch failed on the
    # first missing candidate.
    unrated_items = [item for item in unrated_items if item is not None]

    logger.info("Retrieving rated items")
    rated_items = get_rated_items(items_directory, ratings)
    if len(rated_items) == 0:
        return pd.DataFrame(columns=columns)

    # The first rated item is taken as representative of how all items are codified.
    first_item = rated_items[0]
    if self.item_field not in first_item.field_dict:
        raise ValueError("The field name specified could not be found!")

    try:
        representation = first_item.get_field(self.item_field).get_representation(
            self.item_field_representation)
    except KeyError as err:
        raise ValueError("The given representation id wasn't found for the specified field") from err

    if not isinstance(representation, (EmbeddingField, FeaturesBagField)):
        raise ValueError("The given representation must be an embedding or a tf-idf vector")

    if isinstance(representation, EmbeddingField) and len(representation.value.shape) != 1:
        raise ValueError("The specified representation is not a document embedding, so the centroid"
                         " can not be calculated")

    need_vectorizer = isinstance(representation, FeaturesBagField)

    # (item id, similarity) pairs; the frame is built once at the end instead of
    # growing it with pd.concat for every item.
    rows = []
    if not need_vectorizer:
        logger.info("Computing centroid")
        centroid = self.__get_centroid_without_vectorizer(ratings, rated_items)

        logger.info("Computing similarities")
        for item in unrated_items:
            item_id = item.content_id
            item_array = item.get_field(self.item_field).get_representation(
                self.item_field_representation).value
            logger.info("Computing similarity with %s", item_id)
            similarity = self.__similarity.perform(DenseVector(centroid), DenseVector(item_array))
            rows.append((item_id, similarity))
    else:
        logger.info("Computing centroid")
        centroid, unrated_matrix = self.__get_centroid_with_vectorizer(
            ratings, rated_items, unrated_items)

        logger.info("Computing similarities")
        # presumably unrated_matrix has one row per candidate, in the same order as
        # unrated_items - verify against __get_centroid_with_vectorizer
        for item, item_array in zip(unrated_items, unrated_matrix):
            item_id = item.content_id
            logger.info("Computing similarity with %s", item_id)
            similarity = self.__similarity.perform(SparseVector(centroid), SparseVector(item_array))
            rows.append((item_id, similarity))

    scores = pd.DataFrame(rows, columns=columns)
    scores = scores.sort_values(['rating'], ascending=False).reset_index(drop=True)
    return scores[:recs_number]
def test_load_content_instance(self):
    """
    Checks that load_content_instance returns None for the item "invalid_item"
    of the directory "not_existent".
    """
    loaded = load_content_instance("not_existent", "invalid_item")
    self.assertIsNone(loaded)