def extract_profile(graph: FullGraph, user_id: str) -> Dict:
    """
    Builds the profile of a user from the outgoing links of its node.

    Every successor of the user node (the items the user rated or, more in general, any node
    the user is linked to) becomes a key of the profile, and the value associated to it is the
    'weight' stored on the link going from the user to that successor

    EXAMPLE::
            graph: i1 <---0.2--- u1 ---0.4---> i2

        > print(extract_profile(graph, 'u1'))
        > {'i1': 0.2, 'i2': 0.4}

    Args:
        graph (FullGraph): graph containing the user node and its outgoing links
        user_id (str): id for the user for which the profile will be extracted

    Returns:
        profile (dict): dictionary with the successor nodes of the user as keys and the weights
            of the links connecting the user to them as values
    """
    return {
        successor: graph.get_link_data(user_id, successor)['weight']
        for successor in graph.get_successors(user_id)
    }
def __delete_property_nodes(original_graph: FullGraph, properties_to_keep: List[object]) -> FullGraph:
    """
    Returns a copy of the original graph without the Property nodes whose incoming link label is
    not listed in properties_to_keep (these are the nodes reached through the property labels the
    feature selection algorithm did not select)

    Args:
        original_graph (FullGraph): the original graph used for Feature Selection (it is not modified,
            the removal is done on a copy)
        properties_to_keep (list): property labels to preserve. Note that properties are the labels
            of the edges pointing to Property nodes, not the Property nodes themselves

    Returns:
        Copy of the original graph from which the less important property nodes have been removed
    """

    def _base_label(property_node) -> str:
        # Only the first predecessor is inspected: a property node is expected to be reached through
        # a single kind of label. When the same value plays two roles (e.g. the same DBPedia resource
        # being both a 'film director' and a 'producer') two distinct property nodes should exist
        first_predecessor = original_graph.get_predecessors(property_node)[0]
        raw_label = original_graph.get_link_data(first_predecessor, property_node)['label']

        # in case of multiple representations the label carries a '#...' suffix: only the part
        # before the first '#' is compared (the label is returned untouched when there is no '#')
        return raw_label.split('#')[0]

    discarded_nodes = {
        property_node
        for property_node in original_graph.property_nodes
        if _base_label(property_node) not in properties_to_keep
    }

    pruned_graph = original_graph.copy()
    pruned_graph._remove_nodes_from_graph(discarded_nodes)
    return pruned_graph
def remove_links_for_user(graph: FullGraph, nodes_to_remove: set, user_id: str):
    """
    Removes the links between the user node which represents the user_id passed as an argument and
    a subset of its successors defined in the nodes_to_remove argument. After this phase, the nodes
    left without any predecessor are removed as well: the item nodes of nodes_to_remove that nobody
    points to anymore and the property nodes reachable only from those item nodes.

    This is useful in case a prediction considering only a subset of the items an user rated has to be
    done (in particular this may be the case with Partitioning techniques). If in such cases this phase
    wasn't done and the additional item nodes were simply masked, the results obtained by the prediction
    would be biased by the fact that these nodes and links still exist within the graph

    The graph is modified in place and nothing is returned.

    Args:
        graph (FullGraph): graph on which the links and/or nodes will be removed
        nodes_to_remove (set): set of successor nodes for a specific user for which the links between
            the user and each node will be removed
        user_id (str): string value representing the user_id to consider (used to retrieve the
            corresponding user node from the graph)
    """
    # first step: cut the links going from the user to the successors that must be hidden
    user_links = {
        (user_id, successor)
        for successor in graph.get_successors(user_id)
        if successor in nodes_to_remove
    }
    graph._graph.remove_edges_from(user_links)

    # second step: the item nodes that, after the cut, are not pointed by anyone
    orphan_items = {
        item_node
        for item_node in nodes_to_remove
        if len(graph.get_predecessors(item_node)) == 0
    }

    # third step: a property node becomes useless when ALL of its predecessors are going to be removed.
    # Checking for "exactly one predecessor" would miss the property nodes shared by two or more
    # removed items, leaving them in the graph without any predecessor
    orphan_properties = set()
    for item_node in orphan_items:
        for property_node in graph.get_successors(item_node):
            if property_node in orphan_properties:
                continue
            if all(predecessor in orphan_items
                   for predecessor in graph.get_predecessors(property_node)):
                orphan_properties.add(property_node)

    graph._graph.remove_nodes_from(orphan_items | orphan_properties)
def extract_profile(user_id: str, graph: FullGraph) -> Dict:
    """
    Extracts the user profile by reading the outgoing links of the node representing the user
    inside the graph passed as argument.

    For every successor of the user node, the 'weight' of the link user -> successor is stored in
    the returned dictionary under the successor itself. Each extracted pair is also logged at
    info level.

    Output example: if the links A -> I1 and A -> I2 have respectively weight 0.2 and 0.4, calling
    the method for user 'A' returns {'I1': 0.2, 'I2': 0.4}

    Args:
        user_id (str): id for the user for which the profile will be extracted
        graph (FullGraph): graph from which the user profile will be extracted. The graph is
            received as argument so that the profile reflects the graph actually used during the
            prediction process (for example the one obtained after Feature Selection)

    Returns:
        profile (dict): dictionary with the successor nodes of the user as keys and the weights of
            the links connecting the user to them as values
    """
    profile = {}
    for rated_node in graph.get_successors(user_id):
        weight = graph.get_link_data(user_id, rated_node)['weight']
        profile[rated_node] = weight
        logger.info('unpack %s, %s', str(rated_node), str(weight))
    return profile
def clean_rank(self, rank: Dict, graph: FullGraph, user_id: str = None) -> Dict:
    """
    Filters a ranking, dropping the nodes that are not requested.

    Three kinds of nodes can be discarded, each one controlled by an attribute of the instance:
    user nodes (remove_user_nodes), item nodes already present in the profile of the user
    (remove_items_in_profile) and property nodes (remove_properties). The ranking received as
    argument is left untouched, a new dictionary is returned.

    Args:
        rank (dict): dictionary representing the ranking (keys are nodes and values are their ranked score)
        graph (FullGraph): graph used to recognise the type of each node and to extract the user profile
        user_id (str): id of the user used to extract his profile (if None the profile will be empty,
            so no item is discarded because of the profile)

    Returns:
        new_rank (dict): dictionary representing the filtered ranking, in the same order of the original one
    """
    profile = {} if user_id is None else self.extract_profile(user_id, graph)

    def _must_drop(node) -> bool:
        # the checks are evaluated in the same order for every node: user, item in profile, property
        if self.__remove_user_nodes and graph.is_user_node(node):
            return True
        if self.__remove_items_in_profile and graph.is_item_node(node) and node in profile:
            return True
        return bool(self.__remove_properties and graph.is_property_node(node))

    return {node: score for node, score in rank.items() if not _must_drop(node)}
def create_new_graph_nx(
        graph: FullGraph,
        target_nodes: Union[List[UserNode], List[ItemNode]]) -> nx.DiGraph:
    """
    Creates a NetworkX directed graph linking each target node to the property labels it owns in
    the original graph

    For every target node, each link towards a Property node of the original graph is turned into a
    link towards a new PropertyNode having the *label* of the original link as value. The weight of
    the original link is kept on the new one. Target nodes without any property do not appear in
    the new graph.

    Args:
        graph (FullGraph): the original graph to apply Feature Selection on
        target_nodes (list[object]): list of user or item nodes from the original graph to consider
            for the creation of the new graph (for example, only items recommendable to the active
            user in a ranking algorithm should be considered)

    Returns:
        new_graph (DiGraph): new graph created from the original graph and the list of its nodes to consider
    """
    label_graph = nx.DiGraph()

    for target in target_nodes:
        for neighbour in graph.get_successors(target):
            # only the links going to property nodes are of interest
            if not graph.is_property_node(neighbour):
                continue

            link = graph.get_link_data(target, neighbour)
            label_graph.add_edge(target, PropertyNode(link['label']), weight=link['weight'])

    return label_graph
def __init__(self, source_frame: pd.DataFrame, user_contents_dir: str = None, item_contents_dir: str = None,
             user_exo_properties: List[str] = None, user_exo_representation: Union[str, int] = None,
             item_exo_properties: List[str] = None, item_exo_representation: Union[str, int] = None,
             default_score_label: str = 'score', default_not_rated_value: float = 0.5):
    """
    Initialises the instance by delegating to FullGraph.__init__.

    Every argument is forwarded unchanged under the same keyword, with one exception:
    default_not_rated_value is handed to the parent constructor as default_weight.

    Args:
        source_frame (pd.DataFrame): forwarded as source_frame
        user_contents_dir (str): forwarded as user_contents_dir
        item_contents_dir (str): forwarded as item_contents_dir
        user_exo_properties (list): forwarded as user_exo_properties
        user_exo_representation: forwarded as user_exo_representation
        item_exo_properties (list): forwarded as item_exo_properties
        item_exo_representation: forwarded as item_exo_representation
        default_score_label (str): forwarded as default_score_label
        default_not_rated_value (float): forwarded as default_weight (presumably the weight given
            to links without an explicit score - confirm against FullGraph)
    """
    # explicit call to the FullGraph constructor (not super()), so the parent is fixed regardless of the MRO
    FullGraph.__init__(self, source_frame=source_frame, user_contents_dir=user_contents_dir,
                       item_contents_dir=item_contents_dir, user_exo_properties=user_exo_properties,
                       user_exo_representation=user_exo_representation,
                       item_exo_properties=item_exo_properties,
                       item_exo_representation=item_exo_representation,
                       default_score_label=default_score_label, default_weight=default_not_rated_value)
def assertPropNumber(self, result: FullGraph, user_or_item_nodes: set, expected_prop_number: int):
    """
    Checks that the number of distinct property labels found in the graph returned by the
    FeatureSelectionHandler matches the expected one.

    The labels are collected from the links going from each node in user_or_item_nodes to its
    successors that are property nodes.

    Args:
        result (FullGraph): graph to inspect
        user_or_item_nodes (set): nodes of the graph whose property links will be considered
        expected_prop_number (int): number of distinct property labels expected

    Raises:
        AssertionError: if the number of distinct labels found differs from the expected one
    """
    found_labels = {
        result.get_link_data(node, successor)['label']
        for node in user_or_item_nodes
        for successor in result.get_successors(node)
        if result.is_property_node(successor)
    }

    found_count = len(found_labels)
    if found_count != expected_prop_number:
        raise AssertionError("Expected %s properties but %s found" % (expected_prop_number, found_count))
def _get_property_labels_info(
        graph: FullGraph, nodes_to_get_properties: Iterable[object]) -> list:
    """
    Collects the properties (which in the graph are the labels of the links pointing to property
    nodes) owned by the nodes passed as argument and returns them, without duplicates, in the
    order in which they are met.

    Note that in case of multiple representations the properties are returned in their basic form,
    that is the part of the label preceding the first '#'. So, for example:
        [starring#0#dbpedia, producer#0#dbpedia, ...] -> [starring, producer, ...]

    Args:
        graph (FullGraph): the original graph from which the properties will be extracted
        nodes_to_get_properties (Iterable): iterable containing the nodes in the graph from which
            the properties will be extracted

    Returns:
        properties (list): list containing the properties from the original graph for the Iterable
            of nodes passed as argument
    """
    # the keys of a dict keep the insertion order and are unique: exactly what is needed here
    unique_labels = {}

    for node in nodes_to_get_properties:
        for successor in graph.get_successors(node):
            if not graph.is_property_node(successor):
                continue

            raw_label = graph.get_link_data(node, successor)['label']

            # in case of multiple representations only the part before the first '#' is kept
            # (a label without '#' is left as it is)
            unique_labels.setdefault(raw_label.split('#')[0], None)

    return list(unique_labels)
def process_feature_selection_on_fullgraph(
        self, graph: FullGraph, user_target_nodes: List[object],
        item_target_nodes: List[object]) -> FullGraph:
    """
    Performs feature selection on the FullGraph passed as argument, once considering the
    properties of the user target nodes and once considering the properties of the item target nodes.

    The target lists allow to restrict the nodes whose properties are considered during the
    process (instead of using the whole set of user and item nodes). Values that are not already
    UserNode / ItemNode instances are wrapped before being handed to the algorithm.

    If the algorithm fails (FeatureSelectionException) for users or for items, a warning is logged
    and the original properties of that side are kept. If it fails for both, the original graph
    itself is returned.

    Args:
        graph (FullGraph): original graph on which feature selection will be performed
        user_target_nodes (list): list of user nodes (or values of said nodes) to consider in the
            feature selection process
        item_target_nodes (list): list of item nodes (or values of said nodes) to consider in the
            feature selection process

    Returns:
        Copy of the original graph from which the less important Property nodes (the ones having
        edges with less important property labels) have been removed, or the original graph when
        feature selection failed for both users and items

    Raises:
        FeatureSelectionException: if user_target_nodes contains a node that is not a user node of
            the graph or item_target_nodes contains a node that is not an item node of the graph
    """
    if not all(graph.is_user_node(node) for node in user_target_nodes):
        raise FeatureSelectionException(
            'All nodes in user_target_nodes list must be user nodes')

    if not all(graph.is_item_node(node) for node in item_target_nodes):
        raise FeatureSelectionException(
            'All nodes in item_target_nodes list must be item nodes')

    # raw values are wrapped into the proper node type (lists already made only of nodes are left as they are)
    if not all(isinstance(node, UserNode) for node in user_target_nodes):
        user_target_nodes = [
            node if isinstance(node, UserNode) else UserNode(node)
            for node in user_target_nodes
        ]

    if not all(isinstance(node, ItemNode) for node in item_target_nodes):
        item_target_nodes = [
            node if isinstance(node, ItemNode) else ItemNode(node)
            for node in item_target_nodes
        ]

    properties_to_keep = []

    def _try_selection(targets, start_message: str, failure_suffix: str) -> bool:
        # runs the algorithm on the targets, accumulating the selected labels in properties_to_keep.
        # Returns True when the algorithm failed (in that case nothing is accumulated)
        recsys_logger.info(start_message)
        try:
            properties_to_keep.extend(
                self.__feature_selection_algorithm.perform(graph, targets))
        except FeatureSelectionException as e:
            recsys_logger.warning(str(e) + failure_suffix)
            return True
        return False

    user_fs_failed = _try_selection(user_target_nodes,
                                    "Performing Feature Selection on users",
                                    "! Users original properties will be kept")
    item_fs_failed = _try_selection(item_target_nodes,
                                    "Performing Feature Selection on items",
                                    "! Items original properties will be kept")

    # nothing has been selected at all: there is nothing to remove from the graph
    if user_fs_failed and item_fs_failed:
        recsys_logger.warning(
            "Since items and users original properties will be kept, "
            "the original graph will be returned")
        return graph

    # only one side failed: its original properties are retrieved from the graph and preserved
    if user_fs_failed:
        properties_to_keep.extend(
            self._get_property_labels_info(graph, graph.user_nodes))
    elif item_fs_failed:
        properties_to_keep.extend(
            self._get_property_labels_info(graph, graph.item_nodes))

    return self.__delete_property_nodes(graph, properties_to_keep)
def perform(self, X: FullGraph, y: List[object]) -> List[str]:
    """
    Selects the k most meaningful property labels for the target nodes by running PageRank on a
    graph linking the target nodes to their property labels.

    In order to remove features from the graph (which in this case are properties either for user or
    item nodes), a new networkx directed graph is instantiated. This graph will have two types of nodes:
    the first being either item or user nodes, the second being property nodes (but instead of having a
    value like a DBPedia URI, these nodes will have the property label as value). Each item/user node
    will be connected to the property nodes for which they have a valid value in their representation.

    EXAMPLE: the networkx graph will have the following edge:

        tt0112453 (node representing the item tt0112453) -> starring (PropertyNode representing the label 'starring')

    if tt0112453 is connected to a PropertyNode in the original graph that represents a value for the
    label starring (example: in the original graph
    tt0112453 (ItemNode) -> http://dbpedia.org/resource/Phil_Collins (PropertyNode)
    and the edge between them has the label 'starring').

    After creating this new graph, PageRank is run on its undirected version and the k properties with
    the highest PageRank value are extracted.

    When no specific representation is set for the target nodes (multiple representations), the labels
    differing only for the trailing '_<suffix>' are merged and their PageRank values summed up
    (example: 'starring_0' and 'starring_1' are merged into 'starring'). Labels are grouped by their
    exact base name, so a label never absorbs the score of another label that merely contains its
    name (e.g. 'director' and 'art_director_0' stay separated). A label without any '_' is kept as it is.

    If k is not positive an empty list is returned. If the graph defines the exogenous properties
    for the kind of target nodes and k is not smaller than their number, those properties are
    returned directly, without running PageRank.

    Args:
        X (FullGraph): FullGraph representing the graph on which the feature selection technique will be done
        y (List[object]): list containing either items or users of the graph. It represents the target
            nodes whose properties will be considered. This can be useful in case only a subset of
            items or users is considered (for example, only the items not rated by the user)

    Returns:
        new_prop (List[str]): list containing the top k most meaningful property labels
            (example: ['starring', 'producer'])

    Raises:
        ValueError: if the target list is made neither only of item nodes nor only of user nodes of the graph
    """
    if self.__k <= 0:
        return []

    # checks that all nodes in the target list are either item nodes or user nodes
    # (an empty target list is handled as a list of items)
    if all(X.is_item_node(node) for node in y):
        exo_properties = X.get_item_exogenous_properties()
        get_representation = X.get_item_exogenous_representation
    elif all(X.is_user_node(node) for node in y):
        exo_properties = X.get_user_exogenous_properties()
        get_representation = X.get_user_exogenous_representation
    else:
        raise ValueError("Target list must contain items or users of the corresponding graph")

    # asking for at least as many properties as the available ones: nothing to select
    if exo_properties is not None and self.__k >= len(exo_properties):
        return exo_properties

    representation = get_representation()

    # For each node in the target list retrieves the links towards its property nodes in the original
    # graph and adds to the new graph a link towards a property node having the link label as value
    new_graph = nx.DiGraph()
    for node in y:
        for successor_node in X.get_successors(node):
            if X.is_property_node(successor_node):
                new_property = X.get_link_data(node, successor_node)
                new_graph.add_edge(node, PropertyNode(new_property['label']), weight=new_property['weight'])

    # computes PageRank and keeps only the scores of the property labels
    rank = nx.pagerank(new_graph.to_undirected())
    rank = {node.value: score for node, score in rank.items() if isinstance(node, PropertyNode)}

    # in case multiple representations are considered, the ranking containing multiple representations
    # is transformed into a ranking containing a single one
    # example: {'starring_0': 0.03, 'starring_1': 0.1, ...} becomes {'starring': 0.13, ...}
    if representation is None:
        merged_rank = {}
        for label, score in rank.items():
            base_name, separator, _ = label.rpartition('_')
            # no '_' in the label means there is no suffix to strip
            property_name = base_name if separator else label
            merged_rank[property_name] = merged_rank.get(property_name, 0) + score
        rank = merged_rank

    # labels sorted by descending PageRank value (stable for ties), only the top k are returned
    return sorted(rank, key=rank.get, reverse=True)[:self.__k]
def __init__(self, algorithm: GraphBasedAlgorithm, graph: FullGraph):
    """
    Stores the algorithm and the graph in private attributes, then initialises the parent class
    passing as rating_frame the DataFrame obtained from graph.convert_to_dataframe()

    Args:
        algorithm (GraphBasedAlgorithm): graph based algorithm kept by the instance
        graph (FullGraph): graph kept by the instance, also used to build the rating frame handed
            to the parent constructor
    """
    self.__algorithm = algorithm
    self.__graph = graph
    # the parent constructor receives the graph converted to a DataFrame
    super().__init__(rating_frame=graph.convert_to_dataframe())