Code Example #1
    def test_cluster_with_one_vector(self):
        """
        Test that the centroid of a cluster with a single vector is equivalent to that vector.
        """

        v = Document("a", ["a", "b", "a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertEqual(v.dimensions, c.centroid.dimensions)
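
For reference, a minimal sketch of how a centroid can be computed as the element-wise mean of the member vectors' sparse `dimensions` dicts. `ClusterSketch` is a hypothetical stand-in that only mirrors the interface used in the test above, not the library's actual `Cluster` class.

class ClusterSketch:
    """
    A hypothetical cluster whose centroid is the mean of its vectors.
    """

    def __init__(self, *vectors):
        self.vectors = list(vectors)

    @property
    def centroid(self):
        """
        The element-wise mean of the member vectors, returned as a plain
        dict of dimensions for simplicity.
        """
        if not self.vectors:
            return {}

        dimensions = {}
        for vector in self.vectors:
            for dimension, value in vector.dimensions.items():
                dimensions[dimension] = dimensions.get(dimension, 0) + value
        return {dimension: value / len(self.vectors)
                for dimension, value in dimensions.items()}

With a single vector, the sums are divided by one, so the centroid's dimensions equal the vector's dimensions, which is exactly what the test asserts.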
Code Example #2
    def test_get_centroid(self):
        """
        Test getting the centroid.
        """

        v = Document("", ["a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertTrue(
            all(
                round(v.dimensions.get(dimension, 0), 10) == round(
                    c.centroid.dimensions.get(dimension, 0), 10)
                for dimension in v.dimensions.keys() | c.centroid.dimensions.keys()))
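
The rounding to 10 decimal places guards against floating-point error in the normalized weights. An equivalent check could use `math.isclose` from the standard library; `vectors_equal` below is a hypothetical helper, not part of the test suite.

import math

def vectors_equal(u, v, tol=1e-10):
    """
    Compare two sparse vectors dimension by dimension within a tolerance.
    Missing dimensions are treated as zero.
    """
    dimensions = u.dimensions.keys() | v.dimensions.keys()
    return all(math.isclose(u.dimensions.get(dimension, 0),
                            v.dimensions.get(dimension, 0), abs_tol=tol)
               for dimension in dimensions)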
Code Example #3
    def _add_to_graph(self, graph, outgoing_links, threshold=0):
        """
        Add the links to the graph.
        The function fetches the article text and uses it to add to the weighted graph.

        .. note::

            The weight of an edge is `1 - similarity`.
            The higher the similarity, the lower the weight, so shortest paths are more likely to pass through that edge.

        :param graph: The graph to which to add the new nodes and edges.
        :type graph: :class:`~nx.Graph`
        :param outgoing_links: The dictionary of links.
                               The keys should be the source articles.
                               The values should be the outgoing links from these articles.
        :type outgoing_links: dict
        :param threshold: The minimum similarity between the source and target articles to add an edge between them.
        :type threshold: float
        """
        """
        Get the text from all articles.
        """
        sources = list(outgoing_links.keys())
        targets = [
            link for link_set in outgoing_links.values() for link in link_set
        ]
        articles = text.collect(sources + targets, introduction_only=True)
        """
        Convert each article into a document.
        The article is based only on the first sentence.
        """
        documents = {}
        for title, introduction in articles.items():
            introduction = self._remove_brackets(introduction)
            introduction = self._get_first_sentence(introduction)
            document = Document(introduction,
                                self.tokenizer.tokenize(introduction),
                                scheme=self.scheme)
            document.normalize()
            documents[title] = document
        """
        Add first the nodes, and then the edges to the graph.
        This is done by going through all the outgoing links.
        If they have a page, the similarity between the source article and that link is computed.
        If the similarity exceeds the threshold, add an edge between the two.
        """
        for source, targets in outgoing_links.items():
            if source not in documents:
                continue

            if source not in graph.nodes:
                graph.add_node(source, document=documents[source])

            for target in targets:
                if target not in documents:
                    continue

                if target not in graph.nodes:
                    graph.add_node(target, document=documents[target])

                similarity = vector_math.cosine(documents[source],
                                                documents[target])
                if similarity > threshold:
                    graph.add_edge(source, target, weight=(1 - similarity))
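
For context, `vector_math.cosine` returns the cosine similarity between two documents. The sketch below computes it over the sparse `dimensions` dicts used throughout these examples; it is an assumption about the interface, not the library's actual implementation.

import math

def cosine_sketch(u, v):
    """
    The cosine similarity between two sparse vectors:
    the dot product over the shared dimensions, divided by the product
    of the vectors' magnitudes.
    """
    shared = u.dimensions.keys() & v.dimensions.keys()
    product = sum(u.dimensions[dimension] * v.dimensions[dimension]
                  for dimension in shared)
    u_norm = math.sqrt(sum(value ** 2 for value in u.dimensions.values()))
    v_norm = math.sqrt(sum(value ** 2 for value in v.dimensions.values()))
    return product / (u_norm * v_norm) if u_norm and v_norm else 0

Since edges are weighted as `1 - similarity`, highly similar article pairs receive near-zero weights, so shortest-path algorithms naturally favour paths through them, as the note in the docstring explains.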