Beispiel #1
0
 def test_cc_none_undirected(self):
     """Check clustering coefficients and neighbourhoods of an undirected star.

     The graph links "bobo" to "jack", "jill" and "jane" and nothing else;
     every node is expected to report a clustering coefficient of 0.0.
     """
     graph = GraphBuilder(Graph.UNDIRECTED) \
         .add("bobo", ["jack", "jill", "jane"]) \
         .build()

     for name in ("bobo", "jack", "jill", "jane"):
         self.assertEqual(graph.clustering_coefficients[name], 0.0)

     # Testing clustering coefficient expansion: without the extra argument
     # the neighbourhood is expected to be empty, with (1, 1.0) it is expected
     # to hold the directly linked nodes.
     # NOTE(review): the exact meaning of (1, 1.0) is defined by
     # Graph.neighbourhood, which is not visible here - confirm.
     expanded_neighbourhoods = (
         ("bobo", {("jack", 1), ("jill", 1), ("jane", 1)}),
         ("jack", {("bobo", 1)}),
         ("jill", {("bobo", 1)}),
         ("jane", {("bobo", 1)}),
     )

     for name, with_expansion in expanded_neighbourhoods:
         self.assertEqual(graph.neighbourhood(name, 0, False), set())
         self.assertEqual(graph.neighbourhood(name, 0, False, (1, 1.0)),
                          with_expansion)
Beispiel #2
0
 def test_page_rank_biases(self):
     """Check page rank values of an undirected graph with a bias on "jack".

     Each expected rank is compared with an absolute tolerance of 0.001.
     When a comparison fails, the actual rank of the node under test is
     reported as the assertion message.
     """
     gb = GraphBuilder(Graph.UNDIRECTED)
     # "jack" -> "colt" is added twice.
     # NOTE(review): presumably deliberate, to exercise duplicate links -
     # confirm against GraphBuilder.
     graph = gb.add("bobo", ["jack", "jill", "jane"]) \
         .add("jack", ["colt"]) \
         .add("jack", ["colt"]) \
         .add("jane", ["bobo"]) \
         .add("alik", ["peny"]) \
         .build()
     page_rank = graph.page_rank(biases={"jack": .5})
     expected_ranks = (
         ("bobo", 0.256),
         ("jack", 0.174),
         ("jill", 0.093),
         ("jane", 0.093),
         ("colt", 0.096),
         ("alik", 0.142),
         ("peny", 0.142),
     )

     for name, expected in expected_ranks:
         # The message is always the rank of the node being checked; the
         # hand-written version reported "jane" when "colt" failed.
         self.assertTrue(
             math.isclose(page_rank[name], expected, abs_tol=0.001),
             page_rank[name])
Beispiel #3
0
 def test_page_rank_directed(self):
     """Check page rank values of a directed graph without biases.

     Each expected rank is compared with an absolute tolerance of 0.001.
     When a comparison fails, the actual rank of the node under test is
     reported as the assertion message.
     """
     gb = GraphBuilder(Graph.DIRECTED)
     # "jack" -> "colt" is added twice.
     # NOTE(review): presumably deliberate, to exercise duplicate links -
     # confirm against GraphBuilder.
     graph = gb.add("bobo", ["jack", "jill", "jane"]) \
         .add("jack", ["colt"]) \
         .add("jack", ["colt"]) \
         .add("jane", ["bobo"]) \
         .add("alik", ["peny"]) \
         .build()
     page_rank = graph.page_rank()
     expected_ranks = (
         ("bobo", 0.190),
         ("jack", 0.131),
         ("jill", 0.131),
         ("jane", 0.131),
         ("colt", 0.190),
         ("alik", 0.078),
         ("peny", 0.144),
     )

     for name, expected in expected_ranks:
         # The message is always the rank of the node being checked; the
         # hand-written version reported "jane" when "colt" failed.
         self.assertTrue(
             math.isclose(page_rank[name], expected, abs_tol=0.001),
             page_rank[name])
Beispiel #4
0
 def test_graph_builder_undirected(self):
     """Check links and distance tables of an undirected, two-component graph.

     One component holds "bobo", "jack", "jill", "jane" and "colt"; the other
     holds "alik" and "peny". Distances between nodes of different
     components are expected to be None.
     """
     # Expected per-node maximum distance.
     expected_max_distances = {
         "bobo": 2, "jack": 2, "jill": 3, "jane": 3, "colt": 3,
         "alik": 1, "peny": 1,
     }
     # Expected pairwise distances, one row per starting node.
     expected_distances = {
         "bobo": {"bobo": 0, "jack": 1, "jill": 1, "jane": 1, "colt": 2,
                  "alik": None, "peny": None},
         "jack": {"bobo": 1, "jack": 0, "jill": 2, "jane": 2, "colt": 1,
                  "alik": None, "peny": None},
         "jill": {"bobo": 1, "jack": 2, "jill": 0, "jane": 2, "colt": 3,
                  "alik": None, "peny": None},
         "jane": {"bobo": 1, "jack": 2, "jill": 2, "jane": 0, "colt": 3,
                  "alik": None, "peny": None},
         "colt": {"bobo": 2, "jack": 1, "jill": 3, "jane": 3, "colt": 0,
                  "alik": None, "peny": None},
         "alik": {"bobo": None, "jack": None, "jill": None, "jane": None,
                  "colt": None, "alik": 0, "peny": 1},
         "peny": {"bobo": None, "jack": None, "jill": None, "jane": None,
                  "colt": None, "alik": 1, "peny": 0},
     }

     graph = GraphBuilder(Graph.UNDIRECTED) \
         .add("bobo", ["jack", "jill", "jane"]) \
         .add("jack", ["colt"]) \
         .add("jack", ["colt"]) \
         .add("alik", ["peny"]) \
         .build()

     # The duplicated "jack" -> "colt" add is expected to yield one link.
     actual_links = graph.links()
     expected_links = {
         UndirectedLink("bobo", "jack"),
         UndirectedLink("bobo", "jill"),
         UndirectedLink("bobo", "jane"),
         UndirectedLink("jack", "colt"),
         UndirectedLink("alik", "peny"),
     }
     self.assertEqual(actual_links, expected_links)

     # global_max_distance() is queried before the private tables are read,
     # in the same order as the checks were originally written.
     self.assertEqual(graph.global_max_distance(), 3)
     self.assertEqual(graph._max_distances, expected_max_distances)
     self.assertEqual(graph._distances, expected_distances)
Beispiel #5
0
def build(input_text, input_format, window, separator, keep):
    """Build a Termnet from the co-occurrences parsed out of ``input_text``.

    The input is parsed with ``workbench.parser.parse_input``. For every
    source term, only the targets that share at least ``cutoff`` sentences
    with it are linked in an undirected graph, where
    ``cutoff = max(int(maximum * (100 - keep) / 100), 1)`` and ``maximum``
    is the largest sentence count of any (source, target) pair.

    As a side effect the resulting graph is exported to
    ``graph-adjacency.csv``.

    Args:
        input_text: Iterable input, validated by ``check.check_iterable``.
        input_format: Passed through to the parser.
        window: Passed through to the parser; must be greater than 0.
        separator: Passed through to the parser.
        keep: Percentage in the range 0..100 used to derive the cutoff.

    Returns:
        A Termnet over the built graph, or a Termnet over an empty graph
        with fresh Inflections when no link survived the cutoff.

    Raises:
        AssertionError: If ``window`` or ``keep`` is out of range.
    """
    check.check_iterable(input_text)
    assert window > 0, window
    assert 0 <= keep <= 100, keep
    parse = workbench.parser.parse_input(input_text, input_format, window,
                                         separator)
    builder = GraphBuilder(Graph.UNDIRECTED)

    # Per source term: the sentence count of each of its targets.
    sub_lengths = [[len(sentences) for sentences in subd.values()]
                   for subd in parse.cooccurrences.values()]
    pair_counts = [count for lengths in sub_lengths for count in lengths]

    # How many (source, target) pairs share a given number of sentences.
    count_histogram = {}

    for count in pair_counts:
        count_histogram[count] = count_histogram.get(count, 0) + 1

    logging.debug("count_histogram: %s", count_histogram)

    if sub_lengths:
        # A source without any targets contributes 0 to both extremes.
        maximum = max(max(lengths) if lengths else 0
                      for lengths in sub_lengths)
        minimum = min(min(lengths) if lengths else 0
                      for lengths in sub_lengths)
        # Every source may have an empty target map; avoid dividing by zero.
        average = sum(pair_counts) / len(pair_counts) if pair_counts else 0.0
    else:
        maximum = 0
        minimum = 0
        average = 0.0

    bottom_percent = (100.0 - keep) / 100.0
    cutoff = max(int(maximum * bottom_percent), 1)
    logging.debug("maximum: %s, cutoff: %s", maximum, cutoff)
    occurring_sentences = {}
    excluded_lemmas = set()
    included_lemmas = set()

    # NOTE: terms are used exactly as the parser reports them; they are not
    # mapped to a dominant inflection here.
    for source, target_sentences in sorted(parse.cooccurrences.items()):
        targets = {
            target: sentences
            for target, sentences in target_sentences.items()
            if len(sentences) >= cutoff
        }

        if not targets:
            excluded_lemmas.add(source)
            continue

        builder.add(source, list(targets))
        included_lemmas.add(source)
        source_sentences = occurring_sentences.setdefault(source, {})

        for target, sentences in targets.items():
            included_lemmas.add(target)
            # Sentences are stored as space-joined strings.
            source_sentences.setdefault(target, set()).update(
                " ".join(sentence) for sentence in sentences)

    # A source without qualifying targets can still be in the graph as the
    # target of another source; keep the two sets disjoint so that such a
    # lemma is neither reported as excluded nor counted twice below.
    excluded_lemmas -= included_lemmas

    graph = builder.build()
    graph.export("graph-adjacency.csv",
                 name_fn=lambda identifier: identifier.name())
    properties = Properties(parse.inflections, minimum, maximum, average,
                            cutoff,
                            len(included_lemmas) + len(excluded_lemmas),
                            included_lemmas, excluded_lemmas)

    if len(graph) > 0:
        return Termnet(graph, {LEFT: RankedGraph(graph)}, parse.inflections,
                       occurring_sentences, properties)

    empty = GraphBuilder(Graph.UNDIRECTED).build()
    inflections = Inflections()
    return Termnet(empty, {LEFT: RankedGraph(empty)}, inflections, {},
                   Properties(inflections))
Beispiel #6
0
    def compare_with(self, other):
        """Combine this termnet with ``other`` into a new Termnet.

        The display graphs of both termnets are merged into one graph of the
        same kind, the inflections are combined, and the occurring sentences
        of both are unioned per (a, b) pair. The single ranked graph of
        ``self`` is stored under LEFT and the one of ``other`` under RIGHT.

        Both display graphs must be of the same kind and each termnet must
        hold exactly one ranked graph (checked with assertions).
        """
        assert self.display_graph.kind == other.display_graph.kind
        assert len(self.ranked_graphs) == 1
        assert len(other.ranked_graphs) == 1
        builder = GraphBuilder(self.display_graph.kind)

        # Feed the nodes of self first, then those of other.
        for termnet in (self, other):
            for node in termnet.display_graph.all_nodes:
                descendant_ids = [d.identifier for d in node.descendants]
                builder.add(node.identifier, descendant_ids)

        display_graph = builder.build()
        inflections = self.inflections.combine(other.inflections)

        # Union of the sentence sets of both termnets, keyed a -> b.
        occurring_sentences = {}

        for termnet in (self, other):
            for a, b_sentences in termnet.sentences.items():
                merged_for_a = occurring_sentences.setdefault(a, {})

                for b, sentences in b_sentences.items():
                    merged_for_a.setdefault(b, set()).update(sentences)

        mine = self.properties
        theirs = other.properties
        included_lemmas = mine.included_lemmas.union(theirs.included_lemmas)
        excluded_lemmas = mine.excluded_lemmas.union(theirs.excluded_lemmas)
        # A lemma included by either side does not count as excluded.
        excluded_lemmas.difference_update(included_lemmas)

        minimum = min(mine.minimum_cooccurrence_count,
                      theirs.minimum_cooccurrence_count)
        maximum = max(mine.maximum_cooccurrence_count,
                      theirs.maximum_cooccurrence_count)
        average = (mine.average_cooccurrence_count +
                   theirs.average_cooccurrence_count) / 2.0

        # The cutoff is only kept when both sides agree on it.
        if mine.cutoff_cooccurrence_count == theirs.cutoff_cooccurrence_count:
            cutoff = mine.cutoff_cooccurrence_count
        else:
            cutoff = None

        properties = Properties(inflections, minimum, maximum, average,
                                cutoff, len(display_graph), included_lemmas,
                                excluded_lemmas)
        ranked_graphs = {
            LEFT: list(self.ranked_graphs.values())[0],
            RIGHT: list(other.ranked_graphs.values())[0],
        }
        return Termnet(display_graph, ranked_graphs, inflections,
                       occurring_sentences, properties)