Esempio n. 1
0
    def extract_profile(graph: FullGraph, user_id: str) -> Dict:
        """
        Builds the profile of a user from the outgoing links of its node in the graph.

        Every successor of the user node becomes a key of the profile and the 'weight' stored in the data of the
        link going from the user to that successor becomes the corresponding value.

        EXAMPLE::
             graph: i1 <---0.2--- u1 ---0.4---> i2

            > print(extract_profile(graph, 'u1'))
            > {'i1': 0.2, 'i2': 0.4}

        Args:
            graph (FullGraph): graph containing the user node and its outgoing links
            user_id (str): id of the user whose profile is extracted

        Returns:
            profile (dict): successors of the user as keys, weight of the link (user -> successor) as values
        """
        return {
            successor: graph.get_link_data(user_id, successor)['weight']
            for successor in graph.get_successors(user_id)
        }
    def __delete_property_nodes(original_graph: FullGraph,
                                properties_to_keep: List[object]) -> FullGraph:
        """
        Returns a copy of the original graph without the Property nodes whose ingoing link carries a label that is
        not listed in properties_to_keep.

        Only the link coming from the FIRST predecessor of each Property node is inspected: a Property node is
        expected to be reached through a single kind of label. When the same value plays two roles (for example
        http://dbpedia.org/resource/Phil_Collins as both 'film director' and 'producer') the graph is expected to
        contain two distinct Property nodes for it.

        Args:
            original_graph (FullGraph): the original graph used for Feature Selection (it is not modified, the
                removal is done on a copy)
            properties_to_keep (list): labels that must survive. Note that these are the labels of the edges
                pointing to Property nodes, not the Property nodes themselves

        Returns:
            Copy of the graph from which the Property nodes with a discarded label have been removed
        """

        def base_label(property_node):
            # label of the link (first predecessor -> property node)
            first_predecessor = original_graph.get_predecessors(property_node)[0]
            raw_label = original_graph.get_link_data(first_predecessor, property_node)['label']
            # in case of multiple representations only the part before the first '#' identifies the property
            if '#' in raw_label:
                return raw_label.split('#')[0]
            return raw_label

        discarded_nodes = {
            property_node
            for property_node in original_graph.property_nodes
            if base_label(property_node) not in properties_to_keep
        }

        pruned_graph = original_graph.copy()
        pruned_graph._remove_nodes_from_graph(discarded_nodes)
        return pruned_graph
Esempio n. 3
0
    def remove_links_for_user(graph: FullGraph, nodes_to_remove: set, user_id: str):
        """
        Removes the links between the node of the given user and the subset of its successors listed in
        nodes_to_remove, then cleans up the nodes left without any predecessor.

        The clean up works in two steps:

            1. every node in nodes_to_remove that has no predecessor left is removed from the graph;
            2. every successor of the nodes removed in step 1 (their property nodes) that has no predecessor left
               AFTER that removal is removed as well.

        Doing step 2 after step 1 matters: a property node shared by several removed items has more than one
        predecessor while the items are still in the graph, so checking it beforehand would leave it orphaned.

        This is useful when a prediction must consider only a subset of the items a user rated (for example with
        Partitioning techniques): simply masking the extra item nodes would bias the prediction, since nodes and
        links would still exist within the graph.

        Args:
            graph (FullGraph): graph on which the links and/or nodes will be removed (modified in place)
            nodes_to_remove (set): set of successor nodes for a specific user for which the links between the user
                and each node will be removed
            user_id (str): string value representing the user_id to consider (used to retrieve the corresponding
                user node from the graph)
        """
        # the successors are materialized before touching the graph, so the removal cannot disturb the iteration
        links_to_remove = [(user_id, successor)
                           for successor in graph.get_successors(user_id)
                           if successor in nodes_to_remove]
        graph._graph.remove_edges_from(links_to_remove)

        orphan_items = {item_node for item_node in nodes_to_remove
                        if len(graph.get_predecessors(item_node)) == 0}

        # successors must be collected while the orphan items are still part of the graph
        candidate_nodes = set()
        for item_node in orphan_items:
            candidate_nodes.update(graph.get_successors(item_node))
        candidate_nodes -= orphan_items

        graph._graph.remove_nodes_from(orphan_items)

        # only now the predecessor count tells whether a successor is still reachable from another node
        orphan_successors = {node for node in candidate_nodes
                             if len(graph.get_predecessors(node)) == 0}
        graph._graph.remove_nodes_from(orphan_successors)
Esempio n. 4
0
    def extract_profile(user_id: str, graph: FullGraph) -> Dict:
        """
        Extracts the profile of a user from the graph passed as argument.

        The successors of the user node are retrieved and, for each of them, the 'weight' of the link going from
        the user to the successor is read. Every (successor, weight) pair is also logged at INFO level.

        The graph is an explicit argument (instead of relying on the original graph stored in the class) so that
        the profile reflects modifications applied during the prediction process (such as Feature Selection).

        Output example: if user 'A' rated the items 'I1' and 'I2' and the links A -> I1 and A -> I2 weigh
        respectively 0.2 and 0.4, the result is:
        {'I1': 0.2, 'I2': 0.4}

        Args:
            user_id (str): id of the user whose profile is extracted
            graph (FullGraph): graph from which successors and link weights are read

        Returns:
            profile (dict): successors of the user as keys, weight of the link (user -> successor) as values
        """
        profile = {}
        for item_node in graph.get_successors(user_id):
            weight = graph.get_link_data(user_id, item_node)['weight']
            profile[item_node] = weight
            logger.info('unpack %s, %s', str(item_node), str(weight))
        return profile
Esempio n. 5
0
    def clean_rank(self, rank: Dict, graph: FullGraph, user_id: str = None) -> Dict:
        """
        Produces a filtered copy of a ranking, dropping the nodes that are not requested.

        Depending on the attributes remove_user_nodes, remove_items_in_profile and remove_properties the following
        entries are discarded:

            - user nodes;
            - item nodes that already appear in the profile of the user;
            - property nodes.

        The ranking passed as argument is left untouched.

        Args:
            rank (dict): dictionary representing the ranking (keys are nodes and values are their ranked score)
            graph (FullGraph): graph used to classify the nodes and to extract the user profile
            user_id (str): id of the user used to extract his profile (if None the profile will be empty)

        Returns:
            new_rank (dict): dictionary representing the filtered ranking
        """
        profile = {} if user_id is None else self.extract_profile(user_id, graph)

        def must_be_dropped(node) -> bool:
            # the checks are evaluated in the same order (and with the same short-circuiting) for every node
            if self.__remove_user_nodes and graph.is_user_node(node):
                return True
            if self.__remove_items_in_profile and graph.is_item_node(node) and node in profile:
                return True
            return bool(self.__remove_properties and graph.is_property_node(node))

        return {node: score for node, score in rank.items() if not must_be_dropped(node)}
Esempio n. 6
0
    def create_new_graph_nx(
            graph: FullGraph,
            target_nodes: Union[List[UserNode], List[ItemNode]]) -> nx.DiGraph:
        """
        Builds a NetworkX directed graph that links each target node to the property LABELS it is connected with
        in the original graph.

        For every target node, each successor that is a property node in the original graph is considered: the
        label of the link (target -> property node) is wrapped in a new PropertyNode and an edge going from the
        target node to it is added, carrying the weight of the original link. Target nodes without property
        successors do not appear in the new graph.

        Args:
            graph (FullGraph): the original graph to apply Feature Selection on
            target_nodes (list[object]): list of user or item nodes from the original graph to consider for the
                creation of the new graph (for example, only items recommendable to the active user in a ranking
                algorithm should be considered)

        Returns:
            new_graph (DiGraph): new graph created from the original graph and the list of its nodes to consider
        """
        new_graph = nx.DiGraph()

        for target in target_nodes:
            # lazily filtered so that nodes are classified one at a time, while they are being processed
            property_successors = (successor for successor in graph.get_successors(target)
                                   if graph.is_property_node(successor))
            for property_node in property_successors:
                link = graph.get_link_data(target, property_node)
                label_node = PropertyNode(link['label'])
                new_graph.add_edge(target, label_node, weight=link['weight'])

        return new_graph
Esempio n. 7
0
    def __init__(self, source_frame: pd.DataFrame, user_contents_dir: str = None, item_contents_dir: str = None,
                 user_exo_properties: List[str] = None, user_exo_representation: Union[str, int] = None,
                 item_exo_properties: List[str] = None, item_exo_representation: Union[str, int] = None,
                 default_score_label: str = 'score', default_not_rated_value: float = 0.5):
        """
        Initializes the graph by delegating entirely to FullGraph.__init__.

        Every argument is forwarded unchanged under the same keyword, with one exception:
        default_not_rated_value is passed to FullGraph as default_weight.

        Args:
            source_frame (pd.DataFrame): forwarded as source_frame (presumably the frame of interactions the
                graph is built from; confirm against FullGraph)
            user_contents_dir (str): forwarded as user_contents_dir
            item_contents_dir (str): forwarded as item_contents_dir
            user_exo_properties (List[str]): forwarded as user_exo_properties
            user_exo_representation (Union[str, int]): forwarded as user_exo_representation
            item_exo_properties (List[str]): forwarded as item_exo_properties
            item_exo_representation (Union[str, int]): forwarded as item_exo_representation
            default_score_label (str): forwarded as default_score_label (defaults to 'score')
            default_not_rated_value (float): forwarded as default_weight (defaults to 0.5)
        """

        FullGraph.__init__(self, source_frame=source_frame,
                           user_contents_dir=user_contents_dir, item_contents_dir=item_contents_dir,
                           user_exo_properties=user_exo_properties, user_exo_representation=user_exo_representation,
                           item_exo_properties=item_exo_properties, item_exo_representation=item_exo_representation,
                           default_score_label=default_score_label, default_weight=default_not_rated_value)
    def assertPropNumber(self, result: FullGraph, user_or_item_nodes: set, expected_prop_number: int):
        """
        Checks that the number of distinct property labels reachable from the given nodes matches the expected
        one (used on the graph returned by the FeatureSelectionHandler).

        Args:
            result (FullGraph): graph to inspect
            user_or_item_nodes (set): nodes whose links towards property nodes are considered
            expected_prop_number (int): expected number of distinct labels on those links

        Raises:
            AssertionError: if the number of distinct labels found differs from expected_prop_number
        """
        found_labels = {
            result.get_link_data(node, successor)['label']
            for node in user_or_item_nodes
            for successor in result.get_successors(node)
            if result.is_property_node(successor)
        }

        found_count = len(found_labels)
        if found_count == expected_prop_number:
            return

        raise AssertionError("Expected %s properties but %s found" % (expected_prop_number, found_count))
    def _get_property_labels_info(
            graph: FullGraph,
            nodes_to_get_properties: Iterable[object]) -> list:
        """
        Retrieves the properties (which in the graph are the labels of the links pointing to property nodes) of
        the given nodes and returns them in a list without duplicates, in order of first appearance.

        Note that in case of multiple representations, this function will return the properties in their basic
        form (the part of the label preceding the first '#'). So, for example:

            [starring#0#dbpedia, producer#0#dbpedia, ...] -> [starring, producer, ...]

        Args:
            graph (FullGraph): the original graph from which the properties will be extracted
            nodes_to_get_properties (Iterable): iterable containing the nodes in the graph from which the
                properties will be extracted

        Returns:
            properties (list): list containing the properties from the original graph for the Iterable of nodes
                passed as argument
        """
        # a dict is used as an insertion-ordered set: de-duplication stays O(1) per label instead of scanning
        # a list for every link, while the order of first appearance is preserved
        unique_labels = {}

        for node in nodes_to_get_properties:
            for successor_node in graph.get_successors(node):
                if not graph.is_property_node(successor_node):
                    continue
                property_label = graph.get_link_data(node, successor_node)['label']
                # in case of multiple representations only the part before the first '#' is kept
                # (split() returns the whole label when no '#' is present)
                unique_labels.setdefault(property_label.split('#')[0], None)

        return list(unique_labels)
    def process_feature_selection_on_fullgraph(
            self, graph: FullGraph, user_target_nodes: List[object],
            item_target_nodes: List[object]) -> FullGraph:
        """
        Performs feature selection on a FullGraph, optionally restricting the user and item nodes whose properties
        are considered (instead of using the whole set of user and item nodes).

        The feature selection algorithm is run once on the user targets and once on the item targets; the labels
        it returns are the properties to keep. If the algorithm fails on one side only, all the original property
        labels of that side (taken from every user or item node of the graph) are kept. If it fails on both sides
        the original graph itself is returned.

        Args:
            graph (FullGraph): original graph on which feature selection will be performed
            user_target_nodes (list): list of user nodes (or values of said nodes) to consider in the feature
                selection process
            item_target_nodes (list): list of item nodes (or values of said nodes) to consider in the feature
                selection process

        Returns:
            Copy of the original graph from which the less important Property nodes (the ones having edges with
            less important property labels) have been removed, or the original graph if feature selection failed
            for both users and items

        Raises:
            FeatureSelectionException: if user_target_nodes contains a node that is not a user node of the graph,
                or item_target_nodes contains a node that is not an item node of the graph
        """
        if not all(graph.is_user_node(node) for node in user_target_nodes):
            raise FeatureSelectionException(
                'All nodes in user_target_nodes list must be user nodes')

        if not all(graph.is_item_node(node) for node in item_target_nodes):
            raise FeatureSelectionException(
                'All nodes in item_target_nodes list must be item nodes')

        # raw values are wrapped into the corresponding node type; lists already made of nodes are left as they are
        if not all(isinstance(node, UserNode) for node in user_target_nodes):
            user_target_nodes = [
                node if isinstance(node, UserNode) else UserNode(node)
                for node in user_target_nodes
            ]

        if not all(isinstance(node, ItemNode) for node in item_target_nodes):
            item_target_nodes = [
                node if isinstance(node, ItemNode) else ItemNode(node)
                for node in item_target_nodes
            ]

        kept_properties = []

        recsys_logger.info("Performing Feature Selection on users")
        users_failed = False
        try:
            kept_properties.extend(
                self.__feature_selection_algorithm.perform(graph, user_target_nodes))
        except FeatureSelectionException as error:
            recsys_logger.warning(
                str(error) + "! Users original properties will be kept")
            users_failed = True

        recsys_logger.info("Performing Feature Selection on items")
        items_failed = False
        try:
            kept_properties.extend(
                self.__feature_selection_algorithm.perform(graph, item_target_nodes))
        except FeatureSelectionException as error:
            recsys_logger.warning(
                str(error) + "! Items original properties will be kept")
            items_failed = True

        # nothing could be selected at all: the graph is handed back as it is
        if users_failed and items_failed:
            recsys_logger.warning(
                "Since items and users original properties will be kept, "
                "the original graph will be returned")
            return graph

        # at most one side failed: its original properties are all kept
        if users_failed:
            kept_properties.extend(
                self._get_property_labels_info(graph, graph.user_nodes))
        elif items_failed:
            kept_properties.extend(
                self._get_property_labels_info(graph, graph.item_nodes))

        return self.__delete_property_nodes(graph, kept_properties)
Esempio n. 11
0
    def perform(self, X: FullGraph, y: List[object]) -> List[str]:
        """
        Selects the k most meaningful property labels for the target nodes by running PageRank on a support graph.

        A new networkx directed graph is instantiated with two types of nodes: the target item/user nodes and
        property nodes whose value is a property LABEL (instead of a value like a DBPedia URI). Each target node is
        connected to the label nodes for which it has a value in the original graph.

        EXAMPLE: the networkx graph will have the following edge:

        tt0112453 (node representing the item tt0112453) -> starring (PropertyNode representing the label 'starring')

        if tt0112453 is connected to a PropertyNode in the original graph that represents a value for the
        label starring (example: in the original graph tt0112453 (ItemNode) -> http://dbpedia.org/resource/Phil_Collins
        (PropertyNode) and the edge between them has the label 'starring').

        PageRank is then run on the undirected version of this graph and the k labels with the highest score are
        returned.

        When no single exogenous representation is set (multiple representations), labels are expected to carry a
        '_<representation>' suffix: the suffix is stripped and the scores of the labels sharing the same base name
        are summed (example: {'starring_0': 0.03, 'starring_1': 0.1} becomes {'starring': 0.13}). Scores are grouped
        by EXACT base name, so labels that merely contain another label as a substring (for example 'director_0' and
        'film_director_0') are never mixed together. A label without any '_' is kept under its own name.

        Args:
            X (FullGraph): FullGraph representing the graph on which the feature selection technique will be done
            y (List[object]): list containing either only items or only users of the graph. It represents the
                target nodes whose properties are considered; this is useful when only a subset of items or users
                matters (for example, only the items not rated by the user)

        Returns:
            new_prop (List[str]): list containing the top k most meaningful property labels
                (example: ['starring', 'producer']). An empty list is returned when k <= 0. When k is not smaller
                than the number of exogenous properties declared in the graph for the target type, those
                properties are returned as they are

        Raises:
            ValueError: if y contains neither only item nodes nor only user nodes of the graph
        """

        if self.__k <= 0:
            return []

        # checks that all nodes in the target list are either user nodes or item nodes, an exception is thrown otherwise
        if all(X.is_item_node(node) for node in y):
            exogenous_properties = X.get_item_exogenous_properties()
            if exogenous_properties is not None and self.__k >= len(exogenous_properties):
                # nothing to select: every declared property fits in the top k
                return exogenous_properties
            representation = X.get_item_exogenous_representation()
        elif all(X.is_user_node(node) for node in y):
            exogenous_properties = X.get_user_exogenous_properties()
            if exogenous_properties is not None and self.__k >= len(exogenous_properties):
                return exogenous_properties
            representation = X.get_user_exogenous_representation()
        else:
            raise ValueError("Target list must contain items or users of the corresponding graph")

        # for each target node, every link towards a property node of the original graph becomes a link towards
        # a PropertyNode whose value is the label of the original link
        new_graph = nx.DiGraph()
        for node in y:
            for successor_node in X.get_successors(node):
                if X.is_property_node(successor_node):
                    link_data = X.get_link_data(node, successor_node)
                    new_graph.add_edge(node, PropertyNode(link_data['label']), weight=link_data['weight'])

        # computes PageRank and keeps only the scores of the label nodes
        rank = nx.pagerank(new_graph.to_undirected())
        rank = {node.value: score for node, score in rank.items() if isinstance(node, PropertyNode)}

        # in case multiple representations are considered, the ranking containing multiple representations
        # is transformed into a ranking containing a single one
        # example: {'starring_0': 0.03, 'starring_1': 0.1, ...} -> {'starring': 0.13, ...}
        if representation is None:
            merged_rank = {}
            for label, score in rank.items():
                base_label, separator, _ = label.rpartition('_')
                if not separator:
                    # no representation suffix to strip
                    base_label = label
                merged_rank[base_label] = merged_rank.get(base_label, 0) + score
            rank = merged_rank

        # labels sorted by decreasing score (stable for ties), only the top k are returned
        ranked_labels = sorted(rank, key=rank.get, reverse=True)
        return ranked_labels[:self.__k]
Esempio n. 12
0
 def __init__(self,
              algorithm: GraphBasedAlgorithm,
              graph: FullGraph):
     """
     Stores the algorithm and the graph on the instance and initializes the parent class with the
     ratings obtained from the graph.

     Args:
         algorithm (GraphBasedAlgorithm): algorithm kept in the private attribute __algorithm
         graph (FullGraph): graph kept in the private attribute __graph; the result of its
             convert_to_dataframe() method is passed to the parent constructor as rating_frame
     """
     self.__algorithm = algorithm
     self.__graph = graph
     super().__init__(rating_frame=graph.convert_to_dataframe())