def test_cc_none_undirected(self):
    gb = GraphBuilder(Graph.UNDIRECTED)
    graph = gb.add("bobo", ["jack", "jill", "jane"]) \
        .build()
    self.assertEqual(graph.clustering_coefficients["bobo"], 0.0)
    self.assertEqual(graph.clustering_coefficients["jack"], 0.0)
    self.assertEqual(graph.clustering_coefficients["jill"], 0.0)
    self.assertEqual(graph.clustering_coefficients["jane"], 0.0)
    # Testing clustering coefficient expansion
    self.assertEqual(graph.neighbourhood("bobo", 0, False), set())
    self.assertEqual(graph.neighbourhood("bobo", 0, False, (1, 1.0)),
                     set([("jack", 1), ("jill", 1), ("jane", 1)]))
    self.assertEqual(graph.neighbourhood("jack", 0, False), set())
    self.assertEqual(graph.neighbourhood("jack", 0, False, (1, 1.0)), set([("bobo", 1)]))
    self.assertEqual(graph.neighbourhood("jill", 0, False), set())
    self.assertEqual(graph.neighbourhood("jill", 0, False, (1, 1.0)), set([("bobo", 1)]))
    self.assertEqual(graph.neighbourhood("jane", 0, False), set())
    self.assertEqual(graph.neighbourhood("jane", 0, False, (1, 1.0)), set([("bobo", 1)]))
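# A minimal reference sketch (not the library's implementation) of the local
# clustering coefficient checked above: C(v) = 2 * T(v) / (deg(v) * (deg(v) - 1)),
# where T(v) counts links between neighbours of v. In the star graph built in
# test_cc_none_undirected no two neighbours of "bobo" are linked, so every
# coefficient is 0.0. The `adjacency` dict-of-sets argument is hypothetical.
def _clustering_coefficient_sketch(adjacency, v):
    neighbours = adjacency[v]
    degree = len(neighbours)

    if degree < 2:
        return 0.0

    # Count each unordered neighbour pair that is itself connected, once.
    links = sum(1 for a in neighbours for b in neighbours
                if a < b and b in adjacency[a])
    return 2.0 * links / (degree * (degree - 1))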
def test_page_rank_biases(self):
    gb = GraphBuilder(Graph.UNDIRECTED)
    graph = gb.add("bobo", ["jack", "jill", "jane"]) \
        .add("jack", ["colt"]) \
        .add("jack", ["colt"]) \
        .add("jane", ["bobo"]) \
        .add("alik", ["peny"]) \
        .build()
    page_rank = graph.page_rank(biases={"jack": .5})
    self.assertTrue(math.isclose(page_rank["bobo"], 0.256, abs_tol=0.001), page_rank["bobo"])
    self.assertTrue(math.isclose(page_rank["jack"], 0.174, abs_tol=0.001), page_rank["jack"])
    self.assertTrue(math.isclose(page_rank["jill"], 0.093, abs_tol=0.001), page_rank["jill"])
    self.assertTrue(math.isclose(page_rank["jane"], 0.093, abs_tol=0.001), page_rank["jane"])
    self.assertTrue(math.isclose(page_rank["colt"], 0.096, abs_tol=0.001), page_rank["colt"])
    self.assertTrue(math.isclose(page_rank["alik"], 0.142, abs_tol=0.001), page_rank["alik"])
    self.assertTrue(math.isclose(page_rank["peny"], 0.142, abs_tol=0.001), page_rank["peny"])
def test_page_rank_directed(self):
    gb = GraphBuilder(Graph.DIRECTED)
    graph = gb.add("bobo", ["jack", "jill", "jane"]) \
        .add("jack", ["colt"]) \
        .add("jack", ["colt"]) \
        .add("jane", ["bobo"]) \
        .add("alik", ["peny"]) \
        .build()
    page_rank = graph.page_rank()
    self.assertTrue(math.isclose(page_rank["bobo"], 0.190, abs_tol=0.001), page_rank["bobo"])
    self.assertTrue(math.isclose(page_rank["jack"], 0.131, abs_tol=0.001), page_rank["jack"])
    self.assertTrue(math.isclose(page_rank["jill"], 0.131, abs_tol=0.001), page_rank["jill"])
    self.assertTrue(math.isclose(page_rank["jane"], 0.131, abs_tol=0.001), page_rank["jane"])
    self.assertTrue(math.isclose(page_rank["colt"], 0.190, abs_tol=0.001), page_rank["colt"])
    self.assertTrue(math.isclose(page_rank["alik"], 0.078, abs_tol=0.001), page_rank["alik"])
    self.assertTrue(math.isclose(page_rank["peny"], 0.144, abs_tol=0.001), page_rank["peny"])
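# A minimal power-iteration sketch of the PageRank variant exercised by the two
# tests above, not the library's implementation: rank mass flows along
# out-links, with a damped teleport step. How `biases` reshapes the teleport
# distribution here (the biased node claiming that fraction of the teleport
# mass) and the 0.85 damping factor are assumptions for illustration only.
def _page_rank_sketch(out_links, biases=None, damping=0.85, iterations=100):
    nodes = sorted(out_links)
    teleport = {n: 1.0 / len(nodes) for n in nodes}

    if biases:
        for node, weight in biases.items():
            teleport = {n: (1.0 - weight) * p for n, p in teleport.items()}
            teleport[node] += weight

    rank = {n: 1.0 / len(nodes) for n in nodes}

    for _ in range(iterations):
        next_rank = {n: (1.0 - damping) * teleport[n] for n in nodes}

        for n in nodes:
            targets = out_links[n]

            if targets:
                share = damping * rank[n] / len(targets)

                for t in targets:
                    next_rank[t] += share
            else:
                # Dangling node: spread its mass via the teleport distribution.
                for t in nodes:
                    next_rank[t] += damping * rank[n] * teleport[t]

        rank = next_rank

    return rank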
def test_graph_builder_undirected(self):
    gb = GraphBuilder(Graph.UNDIRECTED)
    graph = gb.add("bobo", ["jack", "jill", "jane"]) \
        .add("jack", ["colt"]) \
        .add("jack", ["colt"]) \
        .add("alik", ["peny"]) \
        .build()
    # The duplicate add("jack", ["colt"]) must collapse to a single link.
    self.assertEqual(graph.links(),
                     set([UndirectedLink("bobo", "jack"), UndirectedLink("bobo", "jill"),
                          UndirectedLink("bobo", "jane"), UndirectedLink("jack", "colt"),
                          UndirectedLink("alik", "peny")]))
    self.assertEqual(graph.global_max_distance(), 3)
    self.assertEqual(graph._max_distances, {
        "bobo": 2,
        "jack": 2,
        "jill": 3,
        "jane": 3,
        "colt": 3,
        "alik": 1,
        "peny": 1,
    })
    # None marks pairs of nodes in disconnected components.
    self.assertEqual(graph._distances, {
        "bobo": {"bobo": 0, "jack": 1, "jill": 1, "jane": 1, "colt": 2, "alik": None, "peny": None},
        "jack": {"bobo": 1, "jack": 0, "jill": 2, "jane": 2, "colt": 1, "alik": None, "peny": None},
        "jill": {"bobo": 1, "jack": 2, "jill": 0, "jane": 2, "colt": 3, "alik": None, "peny": None},
        "jane": {"bobo": 1, "jack": 2, "jill": 2, "jane": 0, "colt": 3, "alik": None, "peny": None},
        "colt": {"bobo": 2, "jack": 1, "jill": 3, "jane": 3, "colt": 0, "alik": None, "peny": None},
        "alik": {"bobo": None, "jack": None, "jill": None, "jane": None, "colt": None, "alik": 0, "peny": 1},
        "peny": {"bobo": None, "jack": None, "jill": None, "jane": None, "colt": None, "alik": 1, "peny": 0},
    })
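# A minimal breadth-first sketch of how a _distances table like the one
# asserted above could be produced; the GraphBuilder internals are not shown
# in this file, so treat this as an assumption-laden illustration. Pairs that
# BFS never reaches keep None, matching the disconnected components above.
from collections import deque

def _all_pairs_distances_sketch(adjacency):
    distances = {}

    for start in adjacency:
        row = {node: None for node in adjacency}
        row[start] = 0
        queue = deque([start])

        while queue:
            current = queue.popleft()

            for neighbour in adjacency[current]:
                if row[neighbour] is None:
                    row[neighbour] = row[current] + 1
                    queue.append(neighbour)

        distances[start] = row

    return distances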
def build(input_text, input_format, window, separator, keep):
    check.check_iterable(input_text)
    assert window > 0, window
    assert keep >= 0 and keep <= 100, keep
    parse = workbench.parser.parse_input(input_text, input_format, window, separator)
    builder = GraphBuilder(Graph.UNDIRECTED)
    count_histogram = {}

    for subd in parse.cooccurrences.values():
        for term_sentences in subd.values():
            if len(term_sentences) not in count_histogram:
                count_histogram[len(term_sentences)] = 0

            count_histogram[len(term_sentences)] += 1

    logging.debug("count_histogram: %s" % count_histogram)
    sub_lengths = [[] if len(subd) == 0 else [len(l) for l in subd.values()]
                   for subd in parse.cooccurrences.values()]

    if len(sub_lengths) > 0:
        maximum = max([0 if len(l) == 0 else max(l) for l in sub_lengths])
        minimum = min([0 if len(l) == 0 else min(l) for l in sub_lengths])
        average = sum([i for l in sub_lengths for i in l]) / sum([len(l) for l in sub_lengths])
    else:
        maximum = 0
        minimum = 0
        average = 0.0

    # Keep only the top `keep` percent of co-occurrence counts, never letting
    # the cutoff fall below 1 (for example, keep=25 with maximum=12 gives
    # bottom_percent=0.75 and cutoff=9).
    bottom_percent = (100.0 - keep) / 100.0
    cutoff = max(int(maximum * bottom_percent), 1)
    logging.debug("maximum: %s, cutoff: %s" % (maximum, cutoff))
    occurring_sentences = {}
    excluded_lemmas = set()
    included_lemmas = set()

    for source, target_sentences in sorted(parse.cooccurrences.items()):
        #source = parse.inflections.to_dominant_inflection(term_a)
        excluded_lemmas.add(source)
        targets = {target: sentences for target, sentences in
                   filter(lambda item: len(item[1]) >= cutoff, target_sentences.items())}
        #targets = {parse.inflections.to_dominant_inflection(term_b): sentences for term_b, sentences in filter(lambda item: len(item[1]) >= cutoff, term_sentences.items())}

        if len(targets) > 0:
            builder.add(source, [t for t in targets.keys()])
            included_lemmas.add(source)
            excluded_lemmas.remove(source)

            if source not in occurring_sentences:
                occurring_sentences[source] = {}

            for target, sentences in targets.items():
                included_lemmas.add(target)

                if target not in occurring_sentences[source]:
                    occurring_sentences[source][target] = set()

                for sentence in sentences:
                    occurring_sentences[source][target].add(" ".join(sentence))

    graph = builder.build()
    graph.export("graph-adjacency.csv", name_fn=lambda identifier: identifier.name())
    properties = Properties(parse.inflections, minimum, maximum, average, cutoff,
                            len(included_lemmas) + len(excluded_lemmas),
                            included_lemmas, excluded_lemmas)

    if len(graph) > 0:
        return Termnet(graph, {LEFT: RankedGraph(graph)}, parse.inflections,
                       occurring_sentences, properties)
    else:
        empty = GraphBuilder(Graph.UNDIRECTED).build()
        inflections = Inflections()
        return Termnet(empty, {LEFT: RankedGraph(empty)}, inflections, {},
                       Properties(inflections))
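# A standalone sketch of the cutoff filtering step inside build() above, shown
# on plain dicts so its behaviour can be verified in isolation; the function
# name and shape of the input are illustrative, not part of the library. Pairs
# whose sentence count falls below the cutoff are dropped, and sources left
# with no surviving targets disappear from the result entirely.
def _filter_by_cutoff_sketch(cooccurrences, cutoff):
    filtered = {}

    for source, target_sentences in cooccurrences.items():
        targets = {target: sentences for target, sentences in target_sentences.items()
                   if len(sentences) >= cutoff}

        if targets:
            filtered[source] = targets

    return filtered

# For example, with cutoff=2:
#   _filter_by_cutoff_sketch({"a": {"b": ["s1", "s2"], "c": ["s1"]}}, 2)
#   == {"a": {"b": ["s1", "s2"]}}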
def compare_with(self, other):
    assert self.display_graph.kind == other.display_graph.kind
    assert len(self.ranked_graphs) == 1
    assert len(other.ranked_graphs) == 1
    builder = GraphBuilder(self.display_graph.kind)

    for node in self.display_graph.all_nodes:
        builder.add(node.identifier, [d.identifier for d in node.descendants])

    for node in other.display_graph.all_nodes:
        builder.add(node.identifier, [d.identifier for d in node.descendants])

    display_graph = builder.build()
    inflections = self.inflections.combine(other.inflections)
    occurring_sentences = {}

    # Union the sentence evidence from both networks.
    for sentence_map in (self.sentences, other.sentences):
        for a, b_sentences in sentence_map.items():
            if a not in occurring_sentences:
                occurring_sentences[a] = {}

            for b, sentences in b_sentences.items():
                if b not in occurring_sentences[a]:
                    occurring_sentences[a][b] = set()

                for sentence in sentences:
                    occurring_sentences[a][b].add(sentence)

    included_lemmas = self.properties.included_lemmas.union(other.properties.included_lemmas)
    excluded_lemmas = self.properties.excluded_lemmas.union(other.properties.excluded_lemmas)

    # A lemma included on either side must not remain excluded.
    for lemma in included_lemmas:
        excluded_lemmas.discard(lemma)

    properties = Properties(
        inflections,
        min(self.properties.minimum_cooccurrence_count, other.properties.minimum_cooccurrence_count),
        max(self.properties.maximum_cooccurrence_count, other.properties.maximum_cooccurrence_count),
        (self.properties.average_cooccurrence_count + other.properties.average_cooccurrence_count) / 2.0,
        self.properties.cutoff_cooccurrence_count
        if self.properties.cutoff_cooccurrence_count == other.properties.cutoff_cooccurrence_count
        else None,
        len(display_graph),
        included_lemmas,
        excluded_lemmas)
    # Each side is guaranteed a single ranked graph by the asserts above.
    return Termnet(
        display_graph,
        {
            LEFT: next(iter(self.ranked_graphs.values())),
            RIGHT: next(iter(other.ranked_graphs.values())),
        },
        inflections,
        occurring_sentences,
        properties)
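# An equivalent, more compact form of the nested sentence-map union performed
# in compare_with() above, kept as a standalone sketch for reference; the name
# is hypothetical. dict.setdefault collapses the existence checks into single
# calls while producing the same {a: {b: set(sentences)}} structure.
def _merge_sentence_maps_sketch(left, right):
    merged = {}

    for sentence_map in (left, right):
        for a, b_sentences in sentence_map.items():
            for b, sentences in b_sentences.items():
                merged.setdefault(a, {}).setdefault(b, set()).update(sentences)

    return merged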