def add_link(self, start_node: object, final_node: object, weight: float = None, label: str = None):
    """
    Connect 'start_node' to 'final_node' with a weighted, labelled edge.

    Both nodes have to be in the graph already: when at least one of them is
    missing no edge is added and a warning is logged instead.

    Args:
        start_node (object): node the link starts from
        final_node (object): node the link points to
        weight (float): weight of the link; when None the value returned by
            get_default_weight() is used
        label (str): label of the link; when None the value returned by
            get_default_score_label() is used
    """
    # Resolve the optional arguments first (label, then weight)
    link_label = self.get_default_score_label() if label is None else label
    link_weight = self.get_default_weight() if weight is None else weight

    both_present = self.node_exists(start_node) and self.node_exists(final_node)
    if not both_present:
        logger.warning("One of the nodes or both don't exist in the graph! Add them before "
                       "calling this method.")
        return

    self.__graph.add_edge(start_node, final_node, weight=link_weight, label=link_label)
def produce_content(self, field_name: str, preprocessor_list: List[InformationProcessor],
                    source: RawInformationSource) -> List[FieldRepresentation]:
    """
    Build one representation for every content found in 'source'.

    When the embedding source is an EmbeddingLearner whose model is still None,
    it is fitted on 'field_name' of 'source' before any representation is produced.

    Args:
        field_name (str): name of the field whose raw data must be represented
        preprocessor_list (List[InformationProcessor]): processors handed to
            process_data() for every raw value
        source (RawInformationSource): iterable of contents; each content is
            indexed by 'field_name'

    Returns:
        List[FieldRepresentation]: the representations, in the same order as
            the contents of 'source'
    """
    embedding_source = self.__embedding_source

    # A trainable source without a model has to be trained before being used
    must_train = isinstance(embedding_source, EmbeddingLearner) and embedding_source.model is None
    if must_train:
        logger.warning(
            "The model %s wasn't found, so it will be created and trained now" % embedding_source.reference)
        logger.warning("The model will be trained on the %s field "
                       "and the data will be processed with %s" % (field_name, preprocessor_list))
        embedding_source.fit(source, [field_name], preprocessor_list)

    # For every content: pre-process the raw field, then turn it into a representation
    representation_list: List[FieldRepresentation] = [
        self.produce_single_repr(self.process_data(content_data[field_name], preprocessor_list))
        for content_data in source
    ]
    return representation_list
def _prop_by_rep(self, content: Content, node: object, exo_rep: str, exo_props: List[str], row: dict):
    """
    Link 'node' to the properties listed in 'exo_props', taken from the exogenous
    representation 'exo_rep' of 'content'.

    For every requested property found in the representation, a property node is
    added to the graph and a link labelled with the property name is created from
    'node' to it. The weight of the link is the value returned by get_preference().

    EXAMPLE:
            exo_rep = 0
            exo_props = ['producer', 'director']

            extracts the 'producer' and 'director' properties from representation '0'
            of 'content' and links 'node' to them

    Nothing is linked (a warning is logged) when the representation lookup raises
    KeyError; a warning is also logged for every requested property that the
    representation doesn't contain.

    Args:
        content (Content): content loaded
        node (object): node to add properties to
        exo_rep (str): representation from where to extract the 'exo_props'
        exo_props (list): the properties list to extract from 'content'
        row (dict): dict-like object containing eventual score for the properties
    """
    try:
        properties = content.get_exogenous_representation(exo_rep).value
    except KeyError:
        # Lazy %-formatting: 'exo_rep' and 'content_id' may be non-string (e.g. int) ids,
        # string concatenation would raise TypeError here
        logger.warning("Representation %s not found for content %s", exo_rep, content.content_id)
        return

    if properties is None:
        return

    for prop in exo_props:
        if prop in properties:
            preference = self.get_preference(prop, row)
            self.add_property_node(properties[prop])
            self.add_link(node, properties[prop], preference, prop)
        else:
            logger.warning("Property %s not found for content %s", prop, content.content_id)
def _all_prop_in_rep(self, content, node, exo_rep, row):
    """
    Link 'node' to ALL the properties of the exogenous representation 'exo_rep'
    of 'content'.

    For every property of the representation, a property node is added to the
    graph and a link labelled with the property name is created from 'node' to it.
    The weight of the link is the value returned by get_preference().

    EXAMPLE:
            exo_rep = 0

            extracts ALL properties from representation '0' of 'content'
            and links 'node' to them

    Nothing is linked (a warning is logged) when the representation lookup raises
    KeyError; a warning is also logged when the representation has no property.

    Args:
        content (Content): content loaded
        node (object): node to add properties to
        exo_rep (str): representation from where to extract the properties
        row (dict): dict-like object containing eventual score for the properties
    """
    try:
        properties = content.get_exogenous_representation(exo_rep).value
    except KeyError:
        # Lazy %-formatting: 'exo_rep' and 'content_id' may be non-string (e.g. int) ids,
        # string concatenation would raise TypeError here
        logger.warning("Representation %s not found for content %s", exo_rep, content.content_id)
        return

    if properties is None:
        return

    for prop_key in properties.keys():
        preference = self.get_preference(prop_key, row)
        self.add_property_node(properties[prop_key])
        self.add_link(node, properties[prop_key], preference, prop_key)

    if len(properties) == 0:
        logger.warning("The chosen representation doesn't have any property!")
def remove_link(self, start_node: object, final_node: object):
    """
    Delete the edge going from 'start_node' to 'final_node'.

    When the underlying graph raises nx.NetworkXError for the removal, the error
    is not propagated: a warning is logged instead.

    Args:
        start_node (object): node the link to remove starts from
        final_node (object): node the link to remove points to
    """
    try:
        self._graph.remove_edge(start_node, final_node)
    except nx.NetworkXError:
        message = ("No link exists between the start node and the final node!\n"
                   "No link will be removed")
        logger.warning(message)
def get_successors(self, node: object) -> List[object]:
    """
    List the successors of 'node', or return None (after logging a warning)
    when the node is not in the graph.

    From the networkx documentation:
    "A successor of n is a node m such that there exists a directed edge from n to m"

    EXAMPLE:
            U1 --> I2
            ↓
            I1

        get_successors(u1) ---> [I1, I2]

    Args:
        node (object): node whose successors are requested

    Returns:
        list of the successors of 'node', None if 'node' doesn't exist in the graph
    """
    if self.node_exists(node):
        return list(self.__graph.successors(node))

    logger.warning("The node specified is not in the graph! Return None")
    return None
def add_link(self, start_node: object, final_node: object, weight: float = None, label: str = None):
    """
    Connect 'start_node' to 'final_node' with a weighted, labelled edge.

    Both nodes have to be in the graph already: when at least one of them is
    missing no edge is added and a warning is logged instead.

    Args:
        start_node (object): node the link starts from
        final_node (object): node the link points to
        weight (float): weight of the link; when None the value returned by
            get_default_weight() is used
        label (str): label of the link; when None the value returned by
            get_default_score_label() is used
    """
    # Resolve the optional arguments first (label, then weight)
    link_label = self.get_default_score_label() if label is None else label
    link_weight = self.get_default_weight() if weight is None else weight

    if not (self.node_exists(start_node) and self.node_exists(final_node)):
        logger.warning("One of the nodes or both don't exist in the graph! Add them before "
                       "calling this method.")
        return

    # The edge must be built on the node instances stored in the graph, not on the
    # objects received: if 'i1' is passed and the graph holds an equal 'ItemNode',
    # networkx would otherwise link the start node to the string 'i1' instead of
    # the ItemNode. So the stored instances are looked up by equality first.
    stored_nodes = list(self._graph.nodes)
    stored_start = stored_nodes[stored_nodes.index(start_node)]
    stored_final = stored_nodes[stored_nodes.index(final_node)]

    self._graph.add_edge(stored_start, stored_final, weight=link_weight, label=link_label)
def calc_folds(self, labels: list):
    """
    Compute how many folds the SVM classifier should perform and store the
    result in the private 'folds' attribute.

    The number of folds is the size of the least represented class, capped at 5.
    Every class should have at least 2 rated items: when this is not the case
    (or when 'labels' is empty) a warning is logged.

    EXAMPLE:
            labels = [1 1 0 1 0]

            The occurrences of every label are counted with collections.Counter(labels):
                count = {"1": 3, "0": 2}

            # There are 3 rated_items of class 1
            # and 2 rated_items of class 0

            The smallest count is then taken:
                min_fold = 2

    Args:
        labels: list of labels of the rated_items

    Returns:
        Number of folds to do (0 when 'labels' is empty).
    """
    class_sizes = collections.Counter(labels)

    # default=0: an empty 'labels' list must end up in the "too few ratings"
    # branch instead of making min() raise ValueError
    min_fold = min(class_sizes.values(), default=0)

    if min_fold < 2:
        logger.warning("There's too few rating for a class! There needs to be at least 2!\n"
                       "No folds will be executed")
    elif min_fold >= 5:
        min_fold = 5

    self.__folds = min_fold
    return min_fold
def perform(self, split: Split):
    """
    Build the plot(s) relating the popularity of every item in the truth frame
    to the number of times the item appears in the predictions.

    Depending on the mode of the metric:
        - 'w_zeros': every item of the truth frame is plotted
        - 'no_zeros': only the items recommended at least once are plotted
        - 'both': the 'w_zeros' plot is always built; the 'no_zeros' one is built
          only if at least one item was never recommended, otherwise a warning is
          logged since the two plots would be identical

    Args:
        split (Split): object exposing the 'pred' and 'truth' frames, both with
            a 'to_id' column

    Returns:
        an empty pd.DataFrame
    """
    predictions = split.pred
    truth = split.truth

    # Popularity of every item of the truth frame, most popular first
    ranked_popularity = Counter(truth[['to_id']].values.flatten()).most_common()

    # Number of recommendations received by every item (0 for items never recommended)
    recs_by_item = Counter(predictions[['to_id']].values.flatten())

    pairs = [(pop, recs_by_item[item]) for item, pop in ranked_popularity]
    recommended_pairs = [(pop, num_of_recs) for pop, num_of_recs in pairs if num_of_recs != 0]

    popularities = [pop for pop, _ in pairs]
    recommendations = [num_of_recs for _, num_of_recs in pairs]
    popularities_no_zeros = [pop for pop, _ in recommended_pairs]
    recommendations_no_zeros = [num_of_recs for _, num_of_recs in recommended_pairs]
    at_least_one_zero = len(recommended_pairs) != len(pairs)

    mode = self.__mode
    if mode == 'w_zeros':
        self.build_w_zeros_plot(popularities, recommendations)
    elif mode == 'no_zeros':
        self.build_no_zeros_plot(popularities_no_zeros, recommendations_no_zeros)
    elif mode == 'both':
        # Both when possible
        self.build_w_zeros_plot(popularities, recommendations)
        if at_least_one_zero:
            self.build_no_zeros_plot(popularities_no_zeros, recommendations_no_zeros)
        else:
            logger.warning(
                "There's no zero recommendation!\n"
                "The graph with 'no-zero' won't be created, it would be identical to the 'w-zero' one!"
            )

    return pd.DataFrame()
def _prop_in_all_rep(self, content, node, exo_props, row):
    """
    Link 'node' to the properties listed in 'exo_props', searched in ALL the
    exogenous representations of 'content'.

    To avoid conflicts between representations holding the same property, the
    label of every link is the property name followed by the representation id
    (name_prop + '_' + exo_rep).

    EXAMPLE:
            exo_props = ['producer', 'director']

            extracts 'producer' and 'director' from ALL the exogenous representations
            of 'content' and links 'node' to them. With the two properties available
            both in representation 0 and in representation 1, the labels of the links
            are 'producer_0', 'director_0', 'producer_1', 'director_1'

    A warning is logged for every requested property that is missing from every
    representation; a single warning is logged when no property was found at all.

    Args:
        content (Content): content loaded
        node (object): node to add properties to
        exo_props (list): the properties list to extract from 'content'
        row (dict): dict-like object containing eventual score for the properties
    """
    # edge label -> (original property name, property value), in discovery order.
    # The original name is stored explicitly: rebuilding it by splitting the label
    # on '_' breaks as soon as the representation id itself contains an underscore
    found = {}
    for rep in content.exogenous_rep_dict:
        rep_properties = content.get_exogenous_rep(rep).value
        for prop in exo_props:
            if prop in rep_properties:
                # format() instead of '+': representation ids are not necessarily strings
                found["{}_{}".format(prop, rep)] = (prop, rep_properties[prop])

    if not found:
        logger.warning("None of the property chosen was found for %s", content.content_id)
        return

    for edge_label, (prop, property_node) in found.items():
        # the preference is searched with the original property name
        preference = self.get_preference(prop, row)
        self.add_property_node(property_node)
        self.add_link(node, property_node, preference, edge_label)

    # A property is reported only when NO representation holds it
    found_names = {prop for prop, _ in found.values()}
    for prop in exo_props:
        if prop not in found_names:
            logger.warning("Property %s not found for %s", prop, content.content_id)
def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
            candidate_item_id_list: list = None):
    """
    Collect the documents liked by the user and delegate the computation of the
    recommendation list to __recs_query.

    An item is considered liked when its score is greater than the positive
    threshold; liked items that can't be loaded from 'items_directory' are ignored.

    Args:
        ratings (pd.DataFrame): ratings of the user, exposing the 'to_id' and
            'score' columns
        recs_number (int): how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list (list): list of the items that can be recommended,
            if None all unrated items will be used

    Returns:
        (pd.DataFrame) the frame returned by __recs_query; an empty frame with
        columns "to_id" and "rating" when the query raises ValueError or
        EmptyIndexError (the error is logged as a warning)

    EXAMPLES:
        Find a recommendation list with two items:
            predict(ratings, 2, '../../example')
        Find a recommendation list with one item considering a candidate list
        containing two items:
            predict(ratings, 1, '../../example', ['tt0114885', 'tt0114388'])
    """
    # Outside development mode the index lives under the home path
    if DEVELOPING:
        index_path = os.path.join(items_directory, 'search_index')
    else:
        index_path = os.path.join(home_path, items_directory, 'search_index')

    liked_scores = []
    liked_documents = []
    for item_id, score in zip(ratings.to_id, ratings.score):
        if not score > self.__positive_threshold:
            continue

        item = load_content_instance(items_directory, item_id)
        if item is None:
            continue

        liked_documents.append(item.index_document_id)
        liked_scores.append(score)

    try:
        return self.__recs_query(liked_documents, ratings.to_id, liked_scores,
                                 recs_number, index_path, candidate_item_id_list)
    except (ValueError, EmptyIndexError) as e:
        logger.warning(str(e))
        return pd.DataFrame(columns=["to_id", "rating"])
def remove_link(self, start_node: object, final_node: object):
    """
    Delete the link going from 'start_node' to 'final_node'.

    When the underlying graph raises nx.NetworkXError for the removal, the error
    is not propagated and a warning is logged instead.

    Args:
        start_node (object): starting node of the link to remove
        final_node (object): ending node of the link to remove
    """
    try:
        self._graph.remove_edge(start_node, final_node)
    except nx.NetworkXError:
        warning_text = ("No link exists between the start node and the final node!\n"
                        "No link will be removed")
        logger.warning(warning_text)
def perform(self, split: Split):
    """
    Build the plot(s) relating the popularity of every item in the truth frame
    to the number of times the item appears in the predictions.

    Depending on the mode of the metric:
        - 'w_zeros': every item of the truth frame is plotted
        - 'no_zeros': only the items recommended at least once are plotted
        - 'both': both plots are built; when every item was recommended at least
          once the 'no_zeros' plot is built from the complete data and a warning
          tells that the two plots are identical

    Args:
        split (Split): object exposing the 'pred' and 'truth' frames, both with
            a 'to_id' column

    Returns:
        an empty pd.DataFrame
    """
    predictions = split.pred
    truth = split.truth

    # Popularity of every item of the truth frame, most popular first
    ranked_popularity = Counter(truth[['to_id']].values.flatten()).most_common()

    # Number of recommendations received by every item (0 for items never recommended)
    recs_by_item = Counter(predictions[['to_id']].values.flatten())

    pairs = [(pop, recs_by_item[item]) for item, pop in ranked_popularity]
    recommended_pairs = [(pop, num_of_recs) for pop, num_of_recs in pairs if num_of_recs != 0]

    popularities = [pop for pop, _ in pairs]
    recommendations = [num_of_recs for _, num_of_recs in pairs]
    popularities_no_zeros = [pop for pop, _ in recommended_pairs]
    recommendations_no_zeros = [num_of_recs for _, num_of_recs in recommended_pairs]
    at_least_one_zero = len(recommended_pairs) != len(pairs)

    mode = self.__mode
    if mode == 'w_zeros':
        self.build_w_zeros_plot(popularities, recommendations)
    elif mode == 'no_zeros':
        self.build_no_zeros_plot(popularities_no_zeros, recommendations_no_zeros)
    elif mode == 'both':
        # Both when possible
        self.build_w_zeros_plot(popularities, recommendations)
        if at_least_one_zero:
            self.build_no_zeros_plot(popularities_no_zeros, recommendations_no_zeros)
        else:
            self.build_no_zeros_plot(popularities, recommendations)
            logger.warning(
                "There's no zero recommendation!\n"
                "The graph with 'no-zero' is identical to the 'w-zero' one!"
            )

    return pd.DataFrame()
def _prop_in_all_rep(self, content, node, exo_props, row):
    """
    Link 'node' to the properties listed in 'exo_props', searched in ALL the
    exogenous representations of 'content'.

    To avoid conflicts between representations holding the same property, the
    label of every link is built as 'property#internal_id', followed by
    '#external_id' when the external id of the representation is not NaN.

    EXAMPLE:
            exo_props = ['producer', 'director']

            extracts 'producer' and 'director' from ALL the exogenous representations
            of 'content' and links 'node' to them. The labels of the links look like
            'director#0#dbpedia', 'director#1#datasetlocal', or 'director#0',
            'director#1' when the representations have no external id

    A warning is logged for every requested property missing from every
    representation.

    Args:
        content (Content): content loaded
        node (object): node to add properties to
        exo_props (list): the properties list to extract from 'content'
        row (dict): dict-like object containing eventual score for the properties
    """
    container = content.exogenous_rep_container
    internal_id_list = container.get_internal_index()
    external_id_list = container.get_external_index()

    for prop in exo_props:
        times_found = 0

        for id_int, id_ext in zip(internal_id_list, external_id_list):
            rep_properties = content.get_exogenous_representation(id_int).value
            if prop not in rep_properties:
                continue
            times_found += 1

            # director#0#dbpedia when an external id is available, director#0 otherwise
            edge_label = "{}#{}".format(prop, str(id_int))
            if pd.notna(id_ext):
                edge_label = edge_label + '#{}'.format(id_ext)

            property_node = rep_properties[prop]

            # the preference is searched in the original frame source
            preference = self.get_preference(prop, row)

            self.add_property_node(property_node)
            self.add_link(node, property_node, preference, edge_label)

        if times_found == 0:
            logger.warning("Property {} not found in any representation of content {}".format(
                prop, content.content_id))
def add_user_tree(self, user_node: object):
    """
    Add a 'user' node to the graph together with its properties.

    The node is always passed to add_user_node(). Its properties are then loaded
    through _add_usr_properties() only if a directory for the user contents is
    set (get_user_contents_dir() is not None), otherwise a warning is logged and
    the node is left without properties.

    Args:
        user_node (object): 'user' node to add to the graph with its properties
    """
    self.add_user_node(user_node)

    if self.get_user_contents_dir() is None:
        logger.warning("The dir is not specified! The node will be added with no "
                       "properties")
        return

    # the properties loader works on a row-like dict holding the user id
    user_row = {'from_id': user_node}
    self._add_usr_properties(user_row)
def get_predecessors(self, node: object) -> List[object]:
    """
    List the predecessors of 'node', or return None (after logging a warning)
    when the node is not in the graph.

    From the networkx documentation:
    "A predecessor of n is a node m such that there exists a directed edge from m to n"

    EXAMPLE:
            I1 <-- U1
            ↑
            U2

        get_predecessors(I1) ---> [U1, U2]

    Args:
        node (object): node whose predecessors are requested

    Returns:
        list of the predecessors of 'node', None if 'node' doesn't exist in the graph
    """
    if self.node_exists(node):
        return list(self._graph.predecessors(node))

    logger.warning("The node specified is not in the graph! Return None")
    return None
def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
            candidate_item_id_list: List = None) -> pd.DataFrame:
    """
    Rank the unrated items by their similarity with the centroid of the items
    positively rated by the user.

    The centroid is the mean of the transformed features of the positive rated
    items; every unrated item is then scored with the similarity between its
    transformed features and the centroid. A candidate_item_id_list can be
    passed, which will be used instead of the unrated items.

    EXAMPLE:
        Creates a recommendation list of length 1 with the similarity to the centroid
        as score, only considering the item tt0114319 instead of all the unrated items.
        (ratings is a DataFrame containing the ratings given by the user)
            predict(ratings=ratings, recs_number=1, items_directory='.../somedir',
                    candidate_item_id_list=['tt0114319'])

    Args:
        ratings (pd.DataFrame): ratings of a user
        recs_number (int): how long the ranking will be
        items_directory (str): name of the directory where the items are stored
        candidate_item_id_list (list): list of the items that can be recommended,
            if None all unrated items will be used

    Returns:
        scores (pd.DataFrame): DataFrame whose columns are the ids of the items (to_id)
            and the similarities between the items and the centroid (rating), sorted by
            descending rating and truncated to 'recs_number' rows. The frame is empty
            when the preprocessing raises ValueError or FileNotFoundError (the error
            is logged as a warning)
    """
    columns = ["to_id", "rating"]

    # Loads the items and extracts features from the unrated items, then
    # extracts features from the positive rated items.
    # If exception, returns an empty score_frame
    try:
        rated_items, unrated_items, unrated_features_bag_list = \
            super().preprocessing(items_directory, ratings, candidate_item_id_list)

        positive_rated_features_bag_list = self.__calc_positive_rated_baglist(rated_items, ratings)
    except (ValueError, FileNotFoundError) as e:
        logger.warning(str(e))
        return pd.DataFrame(columns=columns)

    logger.info("Computing rated items centroid")
    positive_rated_items_array = transform(self.__transformer, positive_rated_features_bag_list)
    centroid = np.array(positive_rated_items_array).mean(axis=0)

    logger.info("Computing similarity between centroid and unrated items")
    unrated_items_array = transform(self.__transformer, unrated_features_bag_list)
    similarities = [self.__similarity.perform(centroid, item) for item in unrated_items_array]

    # The frame is built in one shot: growing it with one pd.concat per item
    # copies the whole frame at every iteration (quadratic in the number of items)
    rows = [(item.content_id, similarity) for item, similarity in zip(unrated_items, similarities)]
    score_frame = pd.DataFrame.from_records(rows, columns=columns)

    score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
    score_frame = score_frame[:recs_number]

    return score_frame
def predict(self, ratings: pd.DataFrame, recs_number: int, items_directory: str,
            candidate_item_id_list: List = None) -> pd.DataFrame:
    """
    Rank the unrated items with the scores predicted by the classifier.

    The classifier is fitted on the features and labels computed from the rated
    items, then predict_proba() is called on the features of the unrated items:
    the score of an item is the second value (index 1) of its prediction.
    If recommendations for certain items are needed, specify them in the
    candidate_item_id_list parameter.

    EXAMPLE
        # Instantiate the ClassifierRecommender object, check its documentation if needed
            alg = ClassifierRecommender(...)

        # Get the 5 most recommended items for the user the ratings belong to
            alg.predict(ratings, 5, path)

        # Get the score for the item 'tt0114576'
            alg.predict(ratings, 1, path, ['tt0114576'])

    Args:
        ratings (pd.DataFrame): ratings of the user
        recs_number (int): how long the ranking will be
        items_directory (str): path to the directory where the items are stored
        candidate_item_id_list: list of the items that can be recommended, if None
            all unrated items will be used

    Returns:
        pd.DataFrame with columns "to_id" and "rating", sorted by descending rating
        and truncated to 'recs_number' rows. The frame is empty when the
        preprocessing raises ValueError or FileNotFoundError (the error is logged
        as a warning)
    """
    columns = ["to_id", "rating"]

    # Loads the items and extracts features from the unrated items, then
    # calculates labels and extracts features from the rated items.
    # If exception, returns an empty score_frame
    try:
        rated_items, unrated_items, unrated_features_bag_list = \
            super().preprocessing(items_directory, ratings, candidate_item_id_list)

        labels, rated_features_bag_list = self.__calc_labels_rated_baglist(rated_items, ratings)
    except (ValueError, FileNotFoundError) as e:
        logger.warning(str(e))
        return pd.DataFrame(columns=columns)

    # If the classifier chosen is SVM we calc how many folds the classifier
    # can do. If no folds is possible, no folds will be executed
    if isinstance(self.__classifier, SVM):
        self.__classifier.calc_folds(labels)

    self.__classifier.fit(rated_features_bag_list, labels)

    logger.info("Predicting scores")
    score_labels = self.__classifier.predict_proba(unrated_features_bag_list)

    # The frame is built in one shot: growing it with one pd.concat per item
    # copies the whole frame at every iteration (quadratic in the number of items)
    rows = [(item.content_id, score[1]) for score, item in zip(score_labels, unrated_items)]
    score_frame = pd.DataFrame.from_records(rows, columns=columns)

    score_frame = score_frame.sort_values(['rating'], ascending=False).reset_index(drop=True)
    score_frame = score_frame[:recs_number]

    return score_frame
def fit(self):
    """
    Evaluate the ranking algorithm for all the users found in the users directory
    of the configuration.

    A recommender system is initialized internally. For every user, every
    partition of the user ratings yields a training set and a test set: the test
    set is used both as truth base and as candidate items list for the ranking
    algorithm, and every metric is computed on the predictions obtained.
    Users whose ratings are rejected by the partitioning technique (ValueError)
    are skipped with a warning.

    Returns:
        ranking_metric_results: has a 'from' column, representing the user_ids for
            which the metrics was computed, and then one different column for every
            metric performed. The returned DataFrame contains one row per user, and
            the corresponding metric values are given by the mean of the values
            obtained for that user.

    Raises:
        ValueError: if no ranking algorithm is set in the configuration
    """
    # initialize recommender to call for prediction computing
    recsys = RecSys(self.config)

    # get all users in specified directory
    logger.info("Loading user instances")
    user_id_list = [os.path.splitext(filename)[0]
                    for filename in os.listdir(self.config.users_directory)]

    # calculate metrics on ranking algorithm results
    if self.config.ranking_algorithm is None:
        raise ValueError("You must set ranking algorithm to compute ranking metrics")

    # One dict per (user, partition). The rows are collected in a list and the
    # frame is built once at the end: DataFrame.append() doesn't exist anymore in
    # pandas >= 2.0 and copied the whole frame at every call
    result_rows = []

    for user_id in user_id_list:
        logger.info("Computing ranking metrics for user %s", user_id)
        user_ratings = self.config.rating_frame[self.config.rating_frame['from_id'] == user_id]
        user_ratings = remove_not_existent_items(user_ratings, self.config.items_directory)

        try:
            self.partitioning.dataframe = user_ratings
        except ValueError as e:
            logger.warning(e)
            logger.warning("The user %s doesn't have enough valid ratings. "
                           "The user will be skipped", user_id)
            continue

        for partition_index in self.partitioning:
            result_dict = {}

            train = user_ratings.iloc[partition_index[0]]
            test = user_ratings.iloc[partition_index[1]]

            truth = test.loc[:, 'to_id':'score']
            truth.columns = ["to_id", "rating"]

            recs_number = len(truth['rating'].values)
            predictions = recsys.fit_eval_ranking(train, truth['to_id'].tolist(), recs_number)

            for metric in self.metrics:
                result_dict['from'] = user_id
                result_dict[str(metric)] = metric.perform(predictions, truth)

            result_rows.append(result_dict)

    ranking_alg_metrics_results = pd.DataFrame(result_rows)

    # one row per user: mean of the values obtained on the partitions of the user
    ranking_alg_metrics_results = ranking_alg_metrics_results.groupby('from').mean().reset_index()

    return ranking_alg_metrics_results
def split_user_in_groups(score_frame: pd.DataFrame, groups: Dict[str, float],
                         pop_items: Set[str]) -> Dict[str, Set[str]]:
    """
    Split the users of 'score_frame' in groups, based on the popularity ratio of
    every user.

    The users are sorted by descending 'popularity_ratio' (as computed by
    pop_ratio_by_user) and assigned to the groups following the order of 'groups':
    every group receives the share of users given by its percentage.

    Args:
        score_frame (pd.DataFrame): DataFrame with columns = ['from_id', 'to_id', 'rating']
        groups (Dict[str, float]): each key contains the name of the group and each value
            contains the percentage of the specified group. If the groups don't cover the
            entire user collection, the rest of the users are considered in a
            'default_diverse' group
        pop_items (Set[str]): set of most popular 'to_id' labels

    Returns:
        groups_dict (Dict<str, Set<str>>): key = group_name, value = Set of 'from_id' labels

    Raises:
        NotEnoughUsers: if there are less users than groups
        PercentageError: if a percentage is not in the range 0 < percentage <= 1, or
            if the sum of the percentages is greater than 1
    """
    import math

    num_of_users = len(set(score_frame['from_id']))
    if num_of_users < len(groups):
        raise NotEnoughUsers("You can't split in {} groups {} users! "
                             "Try reducing number of groups".format(len(groups), num_of_users))

    for percentage_chosen in groups.values():
        if not 0 < percentage_chosen <= 1:
            raise PercentageError('Incorrect percentage! Valid percentage range: 0 < percentage <= 1')

    total = sum(groups.values())

    # Percentages are floats: their sum can miss 1 by a rounding error
    # (0.7 + 0.2 + 0.1 == 0.9999999999999999), so the comparison needs a tolerance,
    # otherwise a complete split is reported as partial
    covers_all_users = math.isclose(total, 1.0, abs_tol=1e-9)
    if not covers_all_users:
        if total > 1:
            raise PercentageError("Incorrect percentage! Sum of percentage is > than 1")

        logger.warning("Sum of percentage is < than 1, "
                       "the {} percentage of users will be inserted into the "
                       "'default_diverse' group".format(1 - total))

    pop_ratio_by_users = pop_ratio_by_user(score_frame, most_pop_items=pop_items)
    pop_ratio_by_users.sort_values(['popularity_ratio'], inplace=True, ascending=False)

    groups_dict: Dict[str, Set[str]] = {}
    last_index = 0
    percentage = 0.0
    for group_name in groups:
        # cumulative share of users covered up to this group
        percentage += groups[group_name]
        group_index = round(num_of_users * percentage)
        if group_index == 0:
            logger.warning('Not enough rows for group {}! It will be discarded'.format(group_name))
        else:
            groups_dict[group_name] = set(pop_ratio_by_users['from_id'][last_index:group_index])
            last_index = group_index

    # users left out by the groups
    if not covers_all_users:
        group_index = round(num_of_users)
        groups_dict['default_diverse'] = set(pop_ratio_by_users['from_id'][last_index:group_index])

    return groups_dict
def predict(self, ratings: pd.DataFrame = None, recs_number: int = 10, items_directory: str = None,
            candidate_item_id_list: List = None):
    """
    Creates a recommendation list containing the top items retrieved by the PageRank algorithm.

    Two types of PageRank computations are possible. The first one, in case the ranking is made
    for a user, is PageRank with Priors considering the user profile as personalization vector.
    The second one, in case no user is defined (empty ratings or None), is standard PageRank.

    If only a subset of the user ratings is passed as an argument, the graph is pruned from the
    links representing the ratings not considered in said subset. Whenever the graph has to be
    modified (pruning, feature selection), a copy of the original graph is created, so that the
    original graph is preserved for future operations.

    It's also possible to include a candidate_item_id_list, in order to consider in the ranking
    only the nodes specified in that list.

    Every ValueError raised during the computation (by the feature selection algorithms, by a
    recommendations number <= 0, by ratings not available in the graph) is logged as a warning
    and an empty recommendation list is returned.

    Args:
        ratings (pd.Dataframe): ratings of the user for which compute the prediction, if None
            or empty dataframe standard PageRank will be computed instead of personalized PageRank
        recs_number (int): length of the recommendation list
        items_directory (str): not used
        candidate_item_id_list (list): if a candidate list is specified, only items in the
            candidate list will be considered for recommendations (also ignoring the
            recommendations number)

    Returns:
        score_frame (pd.Dataframe): dataframe with columns "to_id" and "rating" containing
            the recommendation list
    """
    columns = ["to_id", "rating"]

    try:
        graph = self.fullgraph

        if recs_number <= 0:
            raise ValueError("You must set a valid number of recommendations (> 0) in order to compute PageRank")

        if candidate_item_id_list is None:
            candidate_item_id_list = []

        if ratings is None:
            ratings = pd.DataFrame()

        if len(ratings) != 0:
            user_id = ratings['from_id'].iloc[0]
            personalized = True

            # in case only a subset of ratings from the user is passed, first of all it checks that
            # the ratings in the dataframe are a subset of the ratings in the graph's user profile
            # this is done to check that there aren't items rated by the user in the dataframe
            # but not in the graph's user profile
            user_ratings = set(ratings['to_id'].values)
            user_graph = set([node for node in graph.get_successors(user_id) if graph.is_item_node(node)])
            if not user_ratings.issubset(user_graph):
                raise ValueError("There are ratings in the dataframe not available in the graph for the user")

            # after that it checks if the ratings in the dataframe are equal to the ratings in the
            # graph's user profile. If they are equal no further operation is done, otherwise
            # the graph is simplified so that only items considered in the dataframe are
            # represented in the graph
            if not user_ratings == user_graph:
                additional_nodes = user_graph.difference(user_ratings)
                graph = deepcopy(self.fullgraph)
                logger.warning("The ratings passed are less than the ratings in the graph's user profile.\n"
                               "The graph will be pruned in order to consider only the ratings passed")
                self.remove_links_for_user(graph, additional_nodes, user_id)
        else:
            personalized = False
            user_id = None

        # if the item or the user feature selection algorithms are instantiated it initializes the list of nodes
        # to consider in the feature selection process (which are nodes not referred in the user_ratings, so it
        # doesn't consider the user who the ratings refer to and the items that he voted) and performs the
        # feature selection which will return a list for the new properties to consider (one list for items and
        # one for users)
        if self.__item_feature_selection_algorithm is not None:
            logger.info('Computing feature selection on items')
            if len(ratings) != 0:
                recommended_items = list(set(ratings['to_id']))
            else:
                recommended_items = []
            recommended_items = [item for item in graph.item_nodes if item not in recommended_items]
            new_item_prop = self.__item_feature_selection_algorithm.perform(graph, recommended_items)
        else:
            new_item_prop = graph.get_item_exogenous_properties()

        if self.__user_feature_selection_algorithm is not None:
            logger.info('Computing feature selection on users')
            if len(ratings) != 0:
                recommended_users = list(set(ratings['from_id']))
            else:
                recommended_users = []
            recommended_users = [user for user in graph.user_nodes if user not in recommended_users]
            new_user_prop = self.__user_feature_selection_algorithm.perform(graph, recommended_users)
        else:
            new_user_prop = graph.get_user_exogenous_properties()

        # the lists created by the feature selection algorithms will be used to remove nodes from the graph so that
        # only the specified user and/or item exogenous properties will be considered
        if self.__user_feature_selection_algorithm is not None or \
                self.__item_feature_selection_algorithm is not None:

            # the original graph must not be modified: work on a copy if not already done
            if graph is self.fullgraph:
                graph = deepcopy(self.fullgraph)

            nodes_to_remove = set()
            for property_node in graph.property_nodes:
                for predecessor in graph.get_predecessors(property_node):
                    label = graph.get_link_data(predecessor, property_node)['label']
                    # the last '_'-separated piece of the label is dropped before the comparison
                    label = '_'.join(label.split('_')[:-1])
                    if (new_item_prop is not None and label not in new_item_prop) and \
                            (new_user_prop is not None and label not in new_user_prop):
                        nodes_to_remove.add(property_node)

            graph._graph.remove_nodes_from(nodes_to_remove)

        # runs the PageRank either the personalized through the user profile or the standard one
        if personalized:
            profile = self.extract_profile(user_id, graph)
            if sum(profile.values()) == 0.0:
                logger.warning("Cannot compute personalized PageRank if all the weights are the minimum "
                               "possible value, standard PageRank will be calculated instead")
                scores = nx.pagerank(graph._graph)
            else:
                scores = nx.pagerank(graph._graph.to_undirected(), personalization=profile)
        else:
            scores = nx.pagerank(graph._graph)

        # cleans the results removing nodes (they can be user nodes, items in the user profile and properties)
        scores = self.clean_rank(scores, graph, user_id)
        scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

        if len(candidate_item_id_list) == 0:
            ks = list(scores.keys())
            ks = ks[:recs_number]
        else:
            ks = candidate_item_id_list

        new_scores = {k: scores[k] for k in scores.keys() if k in ks}

        # The frame is built in one shot: growing it with one pd.concat per item
        # copies the whole frame at every iteration (quadratic in the number of items)
        rows = [(item.value, score) for item, score in new_scores.items()]
        score_frame = pd.DataFrame.from_records(rows, columns=columns)

        return score_frame

    except ValueError as e:
        logger.warning(str(e))
        score_frame = pd.DataFrame(columns=columns)
        return score_frame