def test_graph_pattern_canonicalization():
    """Regression tests: canonicalize() must preserve pattern length."""
    # test for bug in lib:
    # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct bnodes
    # see https://github.com/RDFLib/rdflib/issues/494
    # The GraphPattern below causes such a problem, currently we return gp
    # itself instead of a canonical representation of it. We just test the len
    # in case it's fixed in rdflib.
    gp = GraphPattern(
        ((SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
         (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
         (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
         (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR)))
    cgp = canonicalize(gp)
    # canonicalization must not collapse distinct triples
    assert len(gp) == len(cgp)
    # test for a bug in canonicalization when it didn't rewrite fixed
    gp = GraphPattern((
        (TARGET_VAR, Variable('v0'), SOURCE_VAR),
        (TARGET_VAR, Variable('v0'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v3')),
        (TARGET_VAR, Variable('v4'), Variable('v5')),
    ))
    cgp = canonicalize(gp)
    assert len(gp) == len(cgp)
def load_results(fn):
    """Load a gzipped JSON results file.

    :param fn: path to the gzipped JSON results file.
    :return: 3-tuple of (result_patterns, coverage_counts, gtp_scores) where
        result_patterns is a list of (GraphPattern, found_in_run) tuples,
        coverage_counts is a Counter keyed by (source, target) pairs and
        gtp_scores is a GTPScores instance or None if the file contains no
        ground truth pairs.
    """
    logger.info('loading results from: %s', fn)
    with gzip.open(fn) as f:
        res = json.load(f)
    result_patterns = [
        (GraphPattern.from_dict(pattern_run['graph_pattern']),
         pattern_run['found_in_run'])
        for pattern_run in res['patterns']
    ]
    coverage_counts = Counter({
        (decurify(s), decurify(t)): c
        for (s, t), c in res.get('coverage_counts', [])
    })
    gtp_scores = None
    # BUGFIX: default to [] — res.get('ground_truth_pairs') without a default
    # returns None for files lacking the key, crashing the comprehension.
    gtps = [tuple(gtp) for gtp in res.get('ground_truth_pairs', [])]
    if gtps:
        coverage_max_precision = res.get('overall_coverage_max_precision', [])
        if not coverage_max_precision:
            # final result file for example
            coverage_max_precision = res.get('coverage_max_precision', [])
        gtp_scores = GTPScores(gtps)
        gtp_scores.gtp_max_precisions = OrderedDict([
            ((decurify(s), decurify(t)), mp)
            for (s, t), mp in coverage_max_precision
        ])
    logger.info('loaded %d result patterns', len(result_patterns))
    return result_patterns, coverage_counts, gtp_scores
def test_predict_query():
    """predict_query should find Mathematics as a target for Algebra."""
    algebra = URIRef('http://dbpedia.org/resource/Algebra')
    mathematics = URIRef('http://dbpedia.org/resource/Mathematics')
    gp = GraphPattern([
        (SOURCE_VAR, wpl, TARGET_VAR),
        (TARGET_VAR, wpl, SOURCE_VAR),
        (TARGET_VAR, a, Variable('target_type')),
    ])
    t, res = predict_query(sparql, timeout, gp, algebra)
    # the query must return candidates, and the expected target among them
    assert len(res) > 0
    assert mathematics in res
def test_mutate_fix_var():
    """Exercise mutate_fix_var: variable substitution against a live endpoint."""
    # tests on a small subset
    ground_truth_pairs_ = [
        (dbp['Armour'], dbp['Knight']),
        (dbp['Barrel'], dbp['Wine']),
        (dbp['Barrister'], dbp['Law']),
        (dbp['Barrister'], dbp['Lawyer']),
        (dbp['Beak'], dbp['Bird']),
        (dbp['Beetroot'], dbp['Red']),
        (dbp['Belief'], dbp['Religion']),
        (dbp['Blanket'], dbp['Bed']),
        (dbp['Boot'], dbp['Shoe']),
        (dbp['Brine'], dbp['Salt']),
    ]
    v = Variable('v')
    gtp_scores_ = GTPScores(ground_truth_pairs_)
    gp = GraphPattern([
        (SOURCE_VAR, v, TARGET_VAR),
    ])
    # unconstrained edge var: a substitution should always be found
    tgps = mutate_fix_var(sparql, timeout, gtp_scores_, gp)
    assert tgps
    for tgp in tgps:
        logger.info(tgp.to_sparql_select_query())
        assert gp != tgp
        # ?v must have been replaced by a fixed identifier
        assert v not in tgp.vars_in_graph
    gp = GraphPattern([
        (SOURCE_VAR, v, TARGET_VAR),
        (SOURCE_VAR, a, Variable('source_type')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country')),
    ])
    # with the Country restriction none of the gt pairs above should match
    tgps = mutate_fix_var(sparql, timeout, gtp_scores_, gp, rand_var=v)
    assert tgps
    for tgp in tgps:
        logger.info(tgp.to_sparql_select_query())
        assert gp == tgp, 'should not have found any substitution'
    # adding a (city, country) pair makes a substitution possible again
    ground_truth_pairs_.append((dbp['Berlin'], dbp['Germany']))
    gtp_scores_ = GTPScores(ground_truth_pairs_)
    tgps = mutate_fix_var(sparql, timeout, gtp_scores_, gp)
    assert tgps
    for tgp in tgps:
        logger.info(tgp.to_sparql_select_query())
        assert gp != tgp, 'should have found a substitution'
        assert gp.vars_in_graph - tgp.vars_in_graph
def test_variable_substitution_query():
    """variable_substitution_query should rank wikiPageLink / Human highest."""
    source_target_pairs = [
        (URIRef('http://dbpedia.org/resource/Adolescence'),
         URIRef('http://dbpedia.org/resource/Youth')),
        (URIRef('http://dbpedia.org/resource/Adult'),
         URIRef('http://dbpedia.org/resource/Child')),
        (URIRef('http://dbpedia.org/resource/Affinity_(law)'),
         URIRef('http://dbpedia.org/resource/Mother')),
        (URIRef('http://dbpedia.org/resource/Alchemy'),
         URIRef('http://dbpedia.org/resource/Gold')),
        (URIRef('http://dbpedia.org/resource/Alderman'),
         URIRef('http://dbpedia.org/resource/Mayor')),
        (URIRef('http://dbpedia.org/resource/Algebra'),
         URIRef('http://dbpedia.org/resource/Mathematics')),
        (URIRef('http://dbpedia.org/resource/Amen'),
         URIRef('http://dbpedia.org/resource/Prayer')),
        (URIRef('http://dbpedia.org/resource/Amnesia'),
         URIRef('http://dbpedia.org/resource/Memory')),
        (URIRef('http://dbpedia.org/resource/Angel'),
         URIRef('http://dbpedia.org/resource/Heaven')),
        (URIRef('http://dbpedia.org/resource/Arithmetic'),
         URIRef('http://dbpedia.org/resource/Mathematics')),
    ]
    gp = GraphPattern([
        (SOURCE_VAR, Variable('edge'), TARGET_VAR),
        (SOURCE_VAR, a, Variable('source_type')),
        (TARGET_VAR, a, Variable('target_type')),
    ])
    limit = MUTPB_FV_QUERY_LIMIT
    t, res = variable_substitution_query(
        sparql, timeout, gp, Variable('edge'), source_target_pairs, limit)
    logger.debug(res.most_common())
    # NOTE(review): the trailing ", 2" is the assert *message*, not part of
    # the comparison — possibly meant res.most_common()[0] == (wpl, 2);
    # confirm original intent before changing.
    assert res.most_common()[0][0] == wpl, 2
    gp = GraphPattern([
        (Variable('var'), wpl, SOURCE_VAR),
    ])
    t, res = variable_substitution_query(
        sparql, timeout, gp, Variable('var'), source_target_pairs, limit)
    logger.debug(res.most_common())
    assert (URIRef('http://dbpedia.org/resource/Human'), 3) in res.most_common()
def test_evaluate():
    """Evaluate a fixed pattern and check all fitness tuple components."""
    gp = GraphPattern((
        (SOURCE_VAR, wikilink, TARGET_VAR),
        (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country'))
    ))
    res = evaluate(
        sparql, timeout, gtp_scores, gp)
    # (655, 0.4048, 0.4048, 0.0089, 7.5, 3, 3, 2, 0, 0.1936)
    # (remains, score, gain, f_measure, avg_reslens, gt_matches,
    #  patlen, patvars, timeout, qtime)
    update_individuals([gp], [res])
    fitness = gp.fitness.values
    matching_node_pairs = gp.matching_node_pairs
    gtp_precisions = gp.gtp_precisions
    # NOTE(review): re-assignment of the value just read — looks like a no-op;
    # confirm whether a setter side effect is intended.
    gp.matching_node_pairs = matching_node_pairs
    logger.info(gp.matching_node_pairs)
    assert fitness.remains == len(ground_truth_pairs), 'remains wrong?'
    assert fitness.gt_matches == 3, "didn't match 3 gt pairs?"
    score = fitness.score
    assert 0 < score < 0.5, 'score not correct?'
    assert score == fitness.gain, 'score and gain should be the same here'
    assert 0 < fitness.f_measure < 0.1, \
        'f1 measure not correct?'
    assert fitness.patlen == 3, 'pattern should have 3 triples'
    assert fitness.patvars == 2, 'pattern should have 2 vars'
    # looser bounds when the query soft-timed out
    if not query_time_soft_exceeded(fitness.qtime, timeout):
        assert 0 < fitness.avg_reslens < 10, \
            'avg match count should be ~7.5'
        assert fitness.timeout == 0, 'should not be a timeout'
    else:
        assert 0 < fitness.avg_reslens < 15, \
            'avg match count out of bounds for timeout'
        assert fitness.timeout > 0, 'should be a timeout'
    assert isinstance(gtp_precisions, OrderedDict)
    assert list(gtp_precisions) == matching_node_pairs
    logger.info(gtp_precisions)
    assert sum(gtp_precisions.values()) == fitness.gain, \
        'sum of precisions should be gain in this case'
def test_evaluate():
    """Evaluate a fixed pattern and verify each component of the fitness tuple."""
    gp = GraphPattern(
        ((SOURCE_VAR, wikilink, TARGET_VAR),
         (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
         (TARGET_VAR, a, URIRef('http://schema.org/Country'))))
    res = evaluate(sparql, timeout, gtp_scores, gp)
    # (655, 0.4048, 0.4048, 0.0089, 7.5, 3, 3, 2, 0, 0.1936)
    # (remains, score, gain, f_measure, avg_reslens, gt_matches,
    #  patlen, patvars, timeout, qtime)
    update_individuals([gp], [res])
    fitness = gp.fitness.values
    matching_node_pairs = gp.matching_node_pairs
    gtp_precisions = gp.gtp_precisions
    # NOTE(review): assigning back the value just read appears to be a no-op;
    # verify whether a property setter is involved.
    gp.matching_node_pairs = matching_node_pairs
    logger.info(gp.matching_node_pairs)
    assert fitness.remains == len(ground_truth_pairs), 'remains wrong?'
    assert fitness.gt_matches == 3, "didn't match 3 gt pairs?"
    score = fitness.score
    assert 0 < score < 0.5, 'score not correct?'
    assert score == fitness.gain, 'score and gain should be the same here'
    assert 0 < fitness.f_measure < 0.1, \
        'f1 measure not correct?'
    assert fitness.patlen == 3, 'pattern should have 3 triples'
    assert fitness.patvars == 2, 'pattern should have 2 vars'
    # bounds depend on whether the query soft-timed out
    if not query_time_soft_exceeded(fitness.qtime, timeout):
        assert 0 < fitness.avg_reslens < 10, \
            'avg match count should be ~7.5'
        assert fitness.timeout == 0, 'should not be a timeout'
    else:
        assert 0 < fitness.avg_reslens < 15, \
            'avg match count out of bounds for timeout'
        assert fitness.timeout > 0, 'should be a timeout'
    assert isinstance(gtp_precisions, OrderedDict)
    assert list(gtp_precisions) == matching_node_pairs
    logger.info(gtp_precisions)
    assert sum(gtp_precisions.values()) == fitness.gain, \
        'sum of precisions should be gain in this case'
def test_timeout_pattern():
    """A deliberately expensive pattern must soft-exceed the query timeout."""
    u = URIRef('http://dbpedia.org/resource/Template:Reflist')
    wpdisambig = URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates')
    gp = GraphPattern([
        (SOURCE_VAR, Variable('v1'), u),
        (SOURCE_VAR, Variable('v5'), u),
        (TARGET_VAR, Variable('v0'), u),
        (TARGET_VAR, Variable('v3'), u),
        (TARGET_VAR, Variable('v6'), Variable('v2')),
        (Variable('v4'), wpdisambig, TARGET_VAR),
    ])
    res = evaluate(sparql, timeout, gtp_scores, gp)
    update_individuals([gp], [res])
    fitness = gp.fitness.values
    matching_node_pairs = gp.matching_node_pairs
    # NOTE(review): re-assignment of the value just read — looks like a no-op.
    gp.matching_node_pairs = matching_node_pairs
    logger.info(gp.matching_node_pairs)
    # a soft timeout zeroes the score
    assert query_time_soft_exceeded(fitness.qtime, timeout)
    assert fitness.score == 0
    # only a hard timeout also zeroes the f_measure
    if query_time_hard_exceeded(fitness.qtime, timeout):
        assert fitness.f_measure == 0
    else:
        assert fitness.f_measure > 0
def test_timeout_pattern():
    """An expensive pattern should trigger the soft query timeout path."""
    u = URIRef('http://dbpedia.org/resource/Template:Reflist')
    wpdisambig = URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates')
    gp = GraphPattern([
        (SOURCE_VAR, Variable('v1'), u),
        (SOURCE_VAR, Variable('v5'), u),
        (TARGET_VAR, Variable('v0'), u),
        (TARGET_VAR, Variable('v3'), u),
        (TARGET_VAR, Variable('v6'), Variable('v2')),
        (Variable('v4'), wpdisambig, TARGET_VAR),
    ])
    res = evaluate(
        sparql, timeout, gtp_scores, gp)
    update_individuals([gp], [res])
    fitness = gp.fitness.values
    matching_node_pairs = gp.matching_node_pairs
    # NOTE(review): assigning back the just-read value appears to be a no-op.
    gp.matching_node_pairs = matching_node_pairs
    logger.info(gp.matching_node_pairs)
    # soft timeout → score forced to 0
    assert query_time_soft_exceeded(fitness.qtime, timeout)
    assert fitness.score == 0
    # hard timeout additionally forces f_measure to 0
    if query_time_hard_exceeded(fitness.qtime, timeout):
        assert fitness.f_measure == 0
    else:
        assert fitness.f_measure > 0
def test_graph_pattern_connectedness():
    """Patterns joined only via edge vars need is_connected(via_edges=True)."""
    # test edge var connections
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (TARGET_VAR, Variable('p'), Variable('v2')),
    ])
    # shared predicate ?p does not connect the node graph...
    assert not gp.is_connected(), \
        "shouldn't be connected with nodes only: %s" % (gp,)
    # ...but counts when edges participate in connectivity
    assert gp.is_connected(via_edges=True), \
        "should be connected via edges: %s" % (gp,)
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (Variable('p'), Variable('v2'), TARGET_VAR),
    ])
    # ?p appears as edge in one triple and node in the other
    assert not gp.is_connected(), \
        "shouldn't be connected with nodes only: %s" % (gp,)
    assert gp.is_connected(via_edges=True), \
        "should be connected via edges: %s" % (gp,)
def main():
    """Generate triples for a simple 2-triple pattern and load them into the endpoint."""
    from rdflib import Variable
    gp = GraphPattern((
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    ))
    # get list of semantic association pairs and split in train and test sets
    semantic_associations = get_semantic_associations(
        fn='data/dbpedia_random_1000k_uri_pairs.csv.gz',
        limit=None,
    )
    # assocs_train, assocs_test = split_training_test_set(
    #     semantic_associations
    # )
    # stps = tuple(sorted(assocs_train))
    stps = semantic_associations
    triples = generate_triples(gp, stps)
    load_triples_into_endpoint(triples)
def random_path(length):
    """Returns a random path with given length between source and target.

    Paths look like:
    (?source, ?ve1, ?vn1), (?vn1, ?ve2, ?vn2), ... (?vn(l-1), ?vel, ?target)
    As every edge can be flipped randomly.
    """
    assert length > 0
    edge_vars = [Variable('ve%d' % i) for i in range(1, length + 1)]
    node_vars = [Variable('vn%d' % i) for i in range(1, length)]
    node_vars.append(TARGET_VAR)
    # chain the triples from ?source through the intermediate nodes
    chain = []
    cur = SOURCE_VAR
    for edge, node in zip(edge_vars, node_vars):
        chain.append((cur, edge, node))
        cur = node
    # flip each edge direction with probability 0.5
    flipped = []
    for subj, pred, obj in chain:
        if random.random() < .5:
            flipped.append((obj, pred, subj))
        else:
            flipped.append((subj, pred, obj))
    return GraphPattern(flipped)
def simple_paths(length):
    """Returns all paths with given length between source and target.

    Paths look like:
    (?source, ?ve1, ?vn1), (?vn1, ?ve2, ?vn2), ... (?vn(l-1), ?vel, ?target)
    As every edge can be flipped, there are 2**length returned paths.
    """
    assert length > 0
    # build the forward chain ?source -> ... -> ?target once
    ends = [Variable('vn%d' % i) for i in range(1, length)] + [TARGET_VAR]
    chain = []
    cur = SOURCE_VAR
    for idx, end in enumerate(ends, start=1):
        chain.append((cur, Variable('ve%d' % idx), end))
        cur = end
    # emit one pattern per subset of edges to flip (2**length subsets)
    for n, flip_set in enumerate(powerset(range(length))):
        triples = []
        for i, (subj, pred, obj) in enumerate(chain):
            if i in flip_set:
                triples.append((obj, pred, subj))
            else:
                triples.append((subj, pred, obj))
        yield n, GraphPattern(triples)
def main(): from rdflib import Variable # the following triple will timeout if vars_joint was 0: # ?s a owl:Thing . t? a owl:Thing . gp = GraphPattern(( (SOURCE_VAR, Variable('v1'), Variable('v2')), (TARGET_VAR, Variable('v3'), Variable('v2')), )) # get list of semantic association pairs and split in train and test sets semantic_associations = get_semantic_associations( fn='data/dbpedia_random_1000_uri_pairs.csv.gz', limit=100, ) # assocs_train, assocs_test = split_training_test_set( # semantic_associations # ) # stps = tuple(sorted(assocs_train)) stps = semantic_associations print(len(stps)) triples = generate_triples(gp, stps) for t in triples: print(t)
def test_simplify_pattern():
    """mutate_simplify_pattern: bloated patterns shrink, restricting ones don't."""
    gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
    res = mutate_simplify_pattern(gp)
    assert gp == res, 'should not simplify simple pattern'
    # test parallel single var edges
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v1'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated += [
        (SOURCE_VAR, Variable('v2'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    # test edges between fixed nodes
    gp += [
        (SOURCE_VAR, wikilink, dbp['City']),
        (TARGET_VAR, wikilink, dbp['Country']),
    ]
    gp_bloated = gp + [
        (dbp['City'], wikilink, dbp['Country']),
        (dbp['Country'], Variable('v2'), dbp['City']),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    # test unrestricting leaves:
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
        (Variable('v4'), Variable('v7'), Variable('v8')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    # test leaves behind fixed nodes
    gp += [
        (SOURCE_VAR, wikilink, Variable('v4')),
    ]
    gp_bloated = gp + [
        (Variable('v5'), wikilink, dbp['Country']),
        (Variable('v5'), Variable('v6'), Variable('v7')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    # counter example of an advanced but restricting pattern:
    gp = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
        (Variable('v4'), Variable('v7'), Variable('v8')),
        (TARGET_VAR, Variable('v3'), SOURCE_VAR),
        (dbp['City'], Variable('v6'), dbp['Country']),
        (dbp['Country'], Variable('v8'), dbp['City']),
    ]
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )
    # test atomic patterns:
    gp = GraphPattern([(SOURCE_VAR, Variable('v1'), Variable('v2'))])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )
    gp = GraphPattern([
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )
    # test edge var connections
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (TARGET_VAR, Variable('p'), Variable('v2')),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s\nto\n%s' % (gp, res)
    gp2 = gp + [
        (Variable('v1'), Variable('v3'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp2)
    assert res == gp, 'not simplified:\n%s\nto\n%s' % (gp2, res)
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (Variable('p'), Variable('v2'), TARGET_VAR),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s\nto\n%s' % (gp, res)
    gp2 = gp + [
        (Variable('p'), Variable('v3'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp2)
    assert res == gp, 'not simplified:\n%s\nto\n%s' % (gp2, res)
def test_mutate_increase_dist():
    """mutate_increase_dist should grow the pattern diameter by exactly one."""
    pattern = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
    mutated = mutate_increase_dist(pattern)
    # mutation must produce a different pattern...
    assert pattern != mutated
    # ...whose diameter grew by exactly 1
    assert mutated.diameter() == pattern.diameter() + 1
    # the original pattern only contains the source and target vars
    assert pattern.vars_in_graph == {SOURCE_VAR, TARGET_VAR}
def _dejsonify(pattern_str):
    """Parse a JSON-encoded list of triples of variable names into a GraphPattern."""
    triples = []
    for triple in json.loads(pattern_str):
        triples.append(tuple(Variable(name) for name in triple))
    return GraphPattern(triples)
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generate (pid, GraphPattern) candidates of the given triple length.

    Yields (pid, gp) for each accepted pattern and finally (n_patterns, None)
    as a terminator. If count_candidates_only, only yields (n_patterns, None).
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    canonicalized_patterns = {}
    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target,
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [Variable('v%d' % i) for i in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        if source_target_edges:
            possible_edges = possible_nodes
        else:
            possible_edges = possible_vars
    else:
        # disjoint node / edge variable pools
        possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % i) for i in range(length)]
    possible_triples = [
        (s, p, o)
        for s in possible_nodes
        for p in possible_edges
        for o in possible_nodes
    ]
    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)
    if count_candidates_only:
        yield (n_patterns, None)
        return
    i = 0
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)
        # check that source and target are in gp:
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})
        # check there are no skipped variables (nodes or edges)
        # noinspection PyUnboundLocalVariable
        if (
                (node_edge_joint and vars_ != possible_vars[:len(vars_)]) or
                (not node_edge_joint and (
                    nodes != possible_var_nodes[:len(nodes)] or
                    edges != possible_edges[:len(edges)]
                ))
        ):
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue
        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue
        # check for loops if necessary
        if not loops and any([s == o for s, p, o in gp]):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue
        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue
        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid,
                    canonicalized_patterns[cgp][0],
                    gp,
                    canonicalized_patterns[cgp][1]
                )
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        i, n_patterns, length
    )
    yield (n_patterns, None)
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Yield (pid, GraphPattern) candidates of given length, then (n_patterns, None).

    With count_candidates_only only the terminator (n_patterns, None) is
    yielded.
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    canonicalized_patterns = {}
    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target,
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [Variable('v%d' % i) for i in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        if source_target_edges:
            possible_edges = possible_nodes
        else:
            possible_edges = possible_vars
    else:
        # separate variable pools for nodes and edges
        possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % i) for i in range(length)]
    possible_triples = [(s, p, o)
                        for s in possible_nodes
                        for p in possible_edges
                        for o in possible_nodes]
    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d',
                n_patterns, length)
    if count_candidates_only:
        yield (n_patterns, None)
        return
    i = 0
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)
        # check that source and target are in gp:
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})
        # check there are no skipped variables (nodes or edges)
        # noinspection PyUnboundLocalVariable
        if ((node_edge_joint and vars_ != possible_vars[:len(vars_)]) or
                (not node_edge_joint and
                 (nodes != possible_var_nodes[:len(nodes)] or
                  edges != possible_edges[:len(edges)]))):
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue
        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue
        # check for loops if necessary
        if not loops and any([s == o for s, p, o in gp]):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue
        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue
        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s',
                             pid,
                             canonicalized_patterns[cgp][0],
                             gp,
                             canonicalized_patterns[cgp][1])
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                i, n_patterns, length)
    yield (n_patterns, None)
def patterns(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Takes a numerical pattern and generates actual patterns from it.

    Yields (pid, GraphPattern) for each accepted pattern and finally
    (pid + 1, None) as a terminator.
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    canonicalized_patterns = {}
    pid = -1
    for c, num_pat in enumerate(
            numerical_patterns(
                length,
                loops=loops,
                node_edge_joint=node_edge_joint,
            )):
        assert (len(num_pat)) == length, 'too short: %s' % (num_pat, )
        flat_num_pat = [v for t in num_pat for v in t]
        all_numbers = set(flat_num_pat)
        if not p_only_connected:
            # Numerical patterns are always connected, but they might be
            # p_only_connected (e.g., 123 425).
            # Check that the pattern isn't p_only_connected, meaning that it's
            # also connected by nodes (e.g., 123 325).
            # Note that in case of node_edge_joint 123 245 is also considered
            # p_only_connected.
            if not nx.is_connected(to_nx_graph(num_pat)):
                logger.debug('excluded %d: not node connected:\n%s',
                             c, num_pat)
                continue
        if source_target_edges:
            all_numbers = sorted(all_numbers)
            numbers = all_numbers
        else:
            # predicate positions (flat index 1 of each triple) cannot become
            # ?source / ?target
            numbers = sorted(all_numbers - set(flat_num_pat[1::3]))
            all_numbers = sorted(all_numbers)
        if count_candidates_only:
            l = len(numbers)
            # each ordered (source, target) assignment is one candidate
            perms = l * (l - 1)
            pid += perms
            # yield pid, None
            # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info('pattern id: %d, vars: %d, permutations: %d',
                            pid, l, perms)
            continue
        for s, t in permutations(numbers, 2):
            pid += 1
            # source and target are mapped to numbers s and t
            # re-enumerate the leftover numbers to close "holes"
            leftover_numbers = [n for n in all_numbers if n != s and n != t]
            var_map = {
                n: Variable('v%d' % i)
                for i, n in enumerate(leftover_numbers)
            }
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            gp = GraphPattern(
                tuple([tuple([var_map[i] for i in trip])
                       for trip in num_pat]))
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)
            # exclude patterns which are isomorphic to already generated ones
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in canonicalized_patterns:
                    igp = canonicalized_patterns[cgp]
                    igp_numpat, igp_s, igp_t, igp_gp = igp
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                else:
                    canonicalized_patterns[cgp] = (num_pat, s, t, gp)
                    gp = cgp
            yield pid, gp
    yield pid + 1, None
def test_graph_pattern():
    """End-to-end GraphPattern tests: replace, SPARQL rendering, set ops, lookups.

    NOTE(review): in-string whitespace of the expected SPARQL literals may have
    been mangled by extraction — verify indentation against the renderer.
    """
    g = Graph()
    g.add((URIRef('foo'), URIRef('bar'), Literal('bla')))
    g.add((URIRef('foo'), URIRef('baa'), Literal('bla')))
    g.add((URIRef('faa'), URIRef('boo'), Literal('blub')))
    gp = GraphPattern(g)
    gp = gp.replace({
        URIRef('foo'): Variable('a'),
        Literal('bla'): Variable('l'),
    })
    sparql = gp.to_sparql_select_query()
    expected = 'SELECT ?a ?l WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # rendering must be stable across repeated calls
    sparql = gp.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    gp2 = gp.replace({URIRef('baa'): Variable('b')})
    sparql = gp2.to_sparql_select_query(bind={Variable('a'): URIRef('bound')})
    expected = 'SELECT ?a ?b ?l WHERE {\n' \
        ' ?a ?b ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        ' FILTER(\n' \
        ' ?a=<bound>\n' \
        ' )\n' \
        '}\n'
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # source_node / target_node map to ?source / ?target in the query
    gp3 = GraphPattern(g, source_node=URIRef('foo'),
                       target_node=Literal('bla'))
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <baa> ?target .\n' \
        ' ?source <bar> ?target .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp3.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    gp4 = gp3.only_with([TARGET_VAR])
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <baa> ?target .\n' \
        ' ?source <bar> ?target .\n' \
        '}\n'
    sparql = gp4.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # replacing baa with bar makes the two triples identical → dedup
    gp4_red = gp4.replace({URIRef('baa'): URIRef('bar')})
    assert len(gp4) > len(gp4_red), \
        "double edge should've been reduced: %s" % (gp4_red,)
    gp5 = gp3.only_with([URIRef('bar')])
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <bar> ?target .\n' \
        '}\n'
    sparql = gp5.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # pattern union
    gp6 = gp + gp2
    expected = 'SELECT ?a ?b ?l WHERE {\n' \
        ' ?a ?b ?l .\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp6.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # pattern difference
    gp7 = gp - gp2
    expected = 'SELECT ?a ?l WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        '}\n'
    sparql = gp7.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # adding a raw triple tuple
    gp8 = gp + ((TARGET_VAR, TARGET_VAR, TARGET_VAR), )
    expected = 'SELECT ?a ?l ?target WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' ?target ?target ?target .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp8.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    gp9 = gp - gp
    assert not bool(gp9), 'gp9 was not empty'
    gp9 = gp - list(gp)
    assert not bool(gp9), 'gp9 - list(gp9) was not empty'
    # test triples by identifier:
    tbi = gp8.triples_by_identifier()
    expected = {
        Variable('a'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('l'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        URIRef('baa'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
        },
        URIRef('bar'): {
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        URIRef('faa'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        URIRef('boo'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        Literal('blub'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
    }
    assert tbi == expected, 'triples_by_identifier %s != %s' % (tbi, expected)
    tbn = gp8.triples_by_nodes({
        Variable('a'), Variable('target'), URIRef('notthere'),
        URIRef('faa'), URIRef('boo')
    })
    expected = {
        Variable('a'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        URIRef('faa'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        # boo only occurs in edge position, so it maps to an empty set here
        URIRef('notthere'): set(),
        URIRef('boo'): set(),
    }
    assert tbn == expected, 'triples_by_nodes %s != %s' % (tbn, expected)
    # NOTE(review): Variable('?target') includes the '?' — presumably rdflib
    # normalizes it to Variable('target'); confirm.
    tbe = gp8.triples_by_edges(
        {URIRef('baa'), Variable('a'), Variable('?target')})
    expected = {
        URIRef('baa'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        Variable('a'): set(),
    }
    assert tbe == expected, 'triples_by_edges %s != %s' % (tbe, expected)
def test_graph_pattern_stats():
    """GraphPatternStats: per-node and per-pair identifier occurrence counting."""
    gp = GraphPattern(
        (
            (URIRef('bar'), URIRef('pred1'), URIRef('s')),
            (URIRef('foo'), URIRef('pred2'), URIRef('t')),
            (URIRef('s'), URIRef('pred3'), URIRef('t')),
        ),
        source_node=URIRef('s'),
        target_node=URIRef('t'),
    )
    gp1 = GraphPattern(
        (
            (URIRef('bar'), URIRef('pred1'), URIRef('s2')),
            (URIRef('foo'), URIRef('pred2'), URIRef('t')),
            (URIRef('s2'), URIRef('pred3'), URIRef('t')),
            (URIRef('single'), URIRef('pred1'), URIRef('s2')),
        ),
        source_node=URIRef('s2'),
        target_node=URIRef('t'),
    )
    gps = GraphPatternStats()
    gps.add_graph_pattern(gp, URIRef('s'), URIRef('t'))
    identifiers = set(gp.identifier_counts(True))
    # source/target nodes are excluded from the identifier counts
    assert identifiers == {
        URIRef('bar'), URIRef('foo'),
        URIRef('pred1'), URIRef('pred2'), URIRef('pred3'),
    }, identifiers
    assert identifiers == set(gps.identifier_gt_node_count.keys())
    assert identifiers == set(gps.identifier_gt_pair_count.keys())
    # after one pattern all counts are exactly 1
    assert set([(i, 1) for i in identifiers
                ]) == set(gps.identifier_gt_node_count.items())
    assert set([(i, 1) for i in identifiers
                ]) == set(gps.identifier_gt_pair_count.items())
    gps.add_graph_pattern(gp1, URIRef('s2'), URIRef('t'))
    assert set(gps.identifier_gt_node_count.keys()) == \
        identifiers | {URIRef('single')}
    assert set(gps.identifier_gt_pair_count.keys()) == \
        identifiers | {URIRef('single')}
    # node counts: foo/pred2 only touch the shared node t once more
    expected_node = {
        (URIRef('bar'), 2),
        (URIRef('foo'), 1),
        (URIRef('pred1'), 2),
        (URIRef('pred2'), 1),
        (URIRef('pred3'), 2),
        (URIRef('single'), 1),
    }
    res = set(gps.identifier_gt_node_count.items())
    assert expected_node == res, res
    # pair counts: each (source, target) pair counts separately
    expected_pair = {
        (URIRef('bar'), 2),
        (URIRef('foo'), 2),
        (URIRef('pred1'), 2),
        (URIRef('pred2'), 2),
        (URIRef('pred3'), 2),
        (URIRef('single'), 1),
    }
    res = set(gps.identifier_gt_pair_count.items())
    assert expected_pair == res, res
    tmp = gps.min_identifier_gt_node_occurrences(gp)
    assert tmp == 1, 'tmp: %d\n%s' % (tmp, gps)
    tmp = gps.min_identifier_gt_pair_occurrences(gp)
    assert tmp == 2, 'tmp: %d\n%s' % (tmp, gps)
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Yield ``(pid, gp)`` for all valid graph patterns with `length` triples.

    Enumerates combinations of triples built from `length` edge variables
    and up to ``length - 1`` node variables plus ?source / ?target, skipping
    patterns that are incomplete, have numbering gaps, (optionally) contain
    loops, are unconnected, or are isomorphic to an earlier pattern.
    After exhaustion a final ``(n_patterns, None)`` sentinel is yielded.

    :param length: number of triples per generated pattern.
    :param loops: if False, patterns containing s == o triples are excluded.
    :param exclude_isomorphic: if True, only canonical representatives are
        yielded (canonicalized form replaces the raw pattern).
    """
    canonicalized_patterns = {}
    possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
    possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
    possible_edges = [Variable('e%d' % i) for i in range(length)]
    possible_triples = [
        (s, p, o)
        for s in possible_nodes
        for p in possible_edges
        for o in possible_nodes
    ]
    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)
    i = 0
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges)

        # check there are no skipped nodes, e.g., link to n2 picked but no n1
        if nodes != possible_var_nodes[:len(nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        if edges != possible_edges[:len(edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # check for loops if necessary
        # generator expression: no need to materialize a list just for any()
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid,
                    canonicalized_patterns[cgp][0],
                    gp,
                    canonicalized_patterns[cgp][1]
                )
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # sanity check: the enumeration really covered all combinations
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        i, n_patterns, length
    )
    yield (n_patterns, None)
def test_mutate_merge_var():
    """Check all reachable outcomes of mutate_merge_var on tiny patterns."""
    pred_p = Variable('p')
    pred_q = Variable('q')
    gp = GraphPattern([(SOURCE_VAR, pred_p, TARGET_VAR)])

    # a single-triple pattern: node-mode merge leaves it unchanged
    mutated = mutate_merge_var(gp, 0)
    assert mutated == gp
    # edge-mode merge may fold the lone edge var into ?source or ?target
    mutated = mutate_merge_var(gp, 1)
    assert mutated[0][1] in {SOURCE_VAR, TARGET_VAR}

    # two parallel edges between ?source and ?target
    gp2 = gp + [(SOURCE_VAR, pred_q, TARGET_VAR)]
    mutated = mutate_merge_var(gp2, 0)
    assert len(mutated) == 1, \
        "?q must have become ?p or vice versa: %s" % len(mutated)
    assert mutated[0][1] in {pred_p, pred_q}

    saw_edge_merge, saw_st_merge = False, False
    for _ in range(100):
        mutated = mutate_merge_var(gp2, 1)
        if len(mutated) == 1:
            assert mutated[0][1] in {pred_p, pred_q}
            saw_edge_merge = True
        else:
            # one of the edge vars must have become ?s or ?t
            assert {mutated[0][1], mutated[1][1]} & {SOURCE_VAR, TARGET_VAR}
            assert {mutated[0][1], mutated[1][1]} - {SOURCE_VAR, TARGET_VAR}
            saw_st_merge = True
        if saw_edge_merge and saw_st_merge:
            break
    else:
        assert False, "merge never reached one of two cases: %s %s" % (
            saw_edge_merge, saw_st_merge)

    # second triple now has ?q in subject position
    gp2 = gp + [(pred_q, pred_p, TARGET_VAR)]
    merged_into_source, merged_into_target = False, False
    for _ in range(100):
        mutated = mutate_merge_var(gp2, 0)
        if len(mutated) == 1:
            # q must have become ?source
            assert mutated == gp
            merged_into_source = True
        else:
            # q became ?target
            assert mutated == gp + [(TARGET_VAR, pred_p, TARGET_VAR)]
            merged_into_target = True
        if merged_into_source and merged_into_target:
            break
    else:
        assert False, "merge never reached one of two cases: %s %s" % (
            merged_into_source, merged_into_target)

    cases = [False] * 4
    for _ in range(100):
        mutated = mutate_merge_var(gp2, 1)
        if len(mutated) == 1:
            # q must have become ?source
            assert mutated == gp
            cases[0] = True
        else:
            # ?q became ?target or ?p, or ?p one of {?q, ?source, ?target}
            if mutated == gp + [(TARGET_VAR, pred_p, TARGET_VAR)]:
                cases[1] = True
            elif mutated == gp + [(pred_p, pred_p, TARGET_VAR)]:
                cases[2] = True
            else:
                assert mutated[0][1] in {pred_q, SOURCE_VAR, TARGET_VAR}
                cases[3] = True
        if all(cases):
            break
    else:
        assert False, "merge never reached one of the cases: %s" % cases
def test_graph_pattern_stats():
    """Exercise GraphPatternStats identifier counting over two patterns.

    Adds two ground-truth patterns that share most identifiers and checks
    that the per-node and per-pair identifier counts, as well as the
    min-occurrence helpers, reflect the overlap.
    """
    # pattern for ground truth pair (s, t)
    gp = GraphPattern(
        (
            (URIRef('bar'), URIRef('pred1'), URIRef('s')),
            (URIRef('foo'), URIRef('pred2'), URIRef('t')),
            (URIRef('s'), URIRef('pred3'), URIRef('t')),
        ),
        source_node=URIRef('s'),
        target_node=URIRef('t'),
    )
    # pattern for ground truth pair (s2, t); adds the 'single' identifier
    gp1 = GraphPattern(
        (
            (URIRef('bar'), URIRef('pred1'), URIRef('s2')),
            (URIRef('foo'), URIRef('pred2'), URIRef('t')),
            (URIRef('s2'), URIRef('pred3'), URIRef('t')),
            (URIRef('single'), URIRef('pred1'), URIRef('s2')),
        ),
        source_node=URIRef('s2'),
        target_node=URIRef('t'),
    )
    gps = GraphPatternStats()
    gps.add_graph_pattern(gp, URIRef('s'), URIRef('t'))
    identifiers = set(gp.identifier_counts(True))
    assert identifiers == {
        URIRef('bar'),
        URIRef('foo'),
        URIRef('pred1'),
        URIRef('pred2'),
        URIRef('pred3'),
    }, identifiers
    assert identifiers == set(gps.identifier_gt_node_count.keys())
    assert identifiers == set(gps.identifier_gt_pair_count.keys())
    # after a single pattern every identifier was counted exactly once
    # (set comprehension instead of set([...]) around a list comprehension)
    assert {(i, 1) for i in identifiers} == \
        set(gps.identifier_gt_node_count.items())
    assert {(i, 1) for i in identifiers} == \
        set(gps.identifier_gt_pair_count.items())
    gps.add_graph_pattern(gp1, URIRef('s2'), URIRef('t'))
    assert set(gps.identifier_gt_node_count.keys()) == \
        identifiers | {URIRef('single')}
    assert set(gps.identifier_gt_pair_count.keys()) == \
        identifiers | {URIRef('single')}
    # expected per-node counts after both patterns were added
    expected_node = {
        (URIRef('bar'), 2),
        (URIRef('foo'), 1),
        (URIRef('pred1'), 2),
        (URIRef('pred2'), 1),
        (URIRef('pred3'), 2),
        (URIRef('single'), 1),
    }
    res = set(gps.identifier_gt_node_count.items())
    assert expected_node == res, res
    # expected per-pair counts after both patterns were added
    expected_pair = {
        (URIRef('bar'), 2),
        (URIRef('foo'), 2),
        (URIRef('pred1'), 2),
        (URIRef('pred2'), 2),
        (URIRef('pred3'), 2),
        (URIRef('single'), 1),
    }
    res = set(gps.identifier_gt_pair_count.items())
    assert expected_pair == res, res
    tmp = gps.min_identifier_gt_node_occurrences(gp)
    assert tmp == 1, 'tmp: %d\n%s' % (tmp, gps)
    tmp = gps.min_identifier_gt_pair_occurrences(gp)
    assert tmp == 2, 'tmp: %d\n%s' % (tmp, gps)
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Yield ``(pid, gp)`` for all valid graph patterns with `length` triples.

    Enumerates combinations of triples built from `length` edge variables
    and up to ``length - 1`` node variables plus ?source / ?target, skipping
    patterns that are incomplete, have numbering gaps, (optionally) contain
    loops, are unconnected, or are isomorphic to an earlier pattern.
    After exhaustion a final ``(n_patterns, None)`` sentinel is yielded.

    :param length: number of triples per generated pattern.
    :param loops: if False, patterns containing s == o triples are excluded.
    :param exclude_isomorphic: if True, only canonical representatives are
        yielded (canonicalized form replaces the raw pattern).
    """
    canonicalized_patterns = {}
    possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
    possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
    possible_edges = [Variable('e%d' % i) for i in range(length)]
    possible_triples = [
        (s, p, o)
        for s in possible_nodes
        for p in possible_edges
        for o in possible_nodes
    ]
    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d',
                n_patterns, length)
    i = 0
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges)

        # check there are no skipped nodes, e.g., link to n2 picked but no n1
        if nodes != possible_var_nodes[:len(nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        if edges != possible_edges[:len(edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # check for loops if necessary
        # generator expression: no need to materialize a list just for any()
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s',
                             pid,
                             canonicalized_patterns[cgp][0],
                             gp,
                             canonicalized_patterns[cgp][1])
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # sanity check: the enumeration really covered all combinations
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                i, n_patterns, length)
    yield (n_patterns, None)
def test_graph_pattern():
    # Build a small base graph and derive GraphPatterns from it, checking
    # to_sparql_select_query() serialization, pattern arithmetic (+/-),
    # only_with() projections and the triples_by_* lookup helpers.
    g = Graph()
    g.add((URIRef('foo'), URIRef('bar'), Literal('bla')))
    g.add((URIRef('foo'), URIRef('baa'), Literal('bla')))
    g.add((URIRef('faa'), URIRef('boo'), Literal('blub')))
    gp = GraphPattern(g)
    # replace identifiers with variables ?a and ?l
    gp = gp.replace({
        URIRef('foo'): Variable('a'),
        Literal('bla'): Variable('l'),
    })
    sparql = gp.to_sparql_select_query()
    expected = 'SELECT ?a ?l WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # serializing again must yield the same (stable) result
    sparql = gp.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    gp2 = gp.replace({URIRef('baa'): Variable('b')})
    # binding a variable adds a FILTER clause to the query
    sparql = gp2.to_sparql_select_query(
        bind={Variable('a'): URIRef('bound')}
    )
    expected = 'SELECT ?a ?b ?l WHERE {\n' \
        ' ?a ?b ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        ' FILTER(\n' \
        ' ?a=<bound>\n' \
        ' )\n' \
        '}\n'
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # source/target nodes serialize as ?source / ?target
    gp3 = GraphPattern(g, source_node=URIRef('foo'), target_node=Literal('bla'))
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <baa> ?target .\n' \
        ' ?source <bar> ?target .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp3.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # only_with() keeps only triples containing the given identifiers
    gp4 = gp3.only_with([TARGET_VAR])
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <baa> ?target .\n' \
        ' ?source <bar> ?target .\n' \
        '}\n'
    sparql = gp4.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # replacing <baa> with <bar> makes both triples identical -> deduplicated
    gp4_red = gp4.replace({URIRef('baa'): URIRef('bar')})
    assert len(gp4) > len(gp4_red), \
        "double edge should've been reduced: %s" % (gp4_red,)
    gp5 = gp3.only_with([URIRef('bar')])
    expected = 'SELECT ?source ?target WHERE {\n' \
        ' ?source <bar> ?target .\n' \
        '}\n'
    sparql = gp5.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # pattern addition unions the triples of both patterns
    gp6 = gp + gp2
    expected = 'SELECT ?a ?b ?l WHERE {\n' \
        ' ?a ?b ?l .\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp6.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # pattern subtraction removes the shared triples
    gp7 = gp - gp2
    expected = 'SELECT ?a ?l WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        '}\n'
    sparql = gp7.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # adding a plain tuple of triples also works
    gp8 = gp + ((TARGET_VAR, TARGET_VAR, TARGET_VAR),)
    expected = 'SELECT ?a ?l ?target WHERE {\n' \
        ' ?a <baa> ?l .\n' \
        ' ?a <bar> ?l .\n' \
        ' ?target ?target ?target .\n' \
        ' <faa> <boo> "blub" .\n' \
        '}\n'
    sparql = gp8.to_sparql_select_query()
    assert sparql == expected, "expected: %s\ngot: %s" % (expected, sparql)
    # subtracting a pattern (or its triple list) from itself empties it
    gp9 = gp - gp
    assert not bool(gp9), 'gp9 was not empty'
    gp9 = gp - list(gp)
    assert not bool(gp9), 'gp9 - list(gp9) was not empty'
    # test triples by identifier:
    tbi = gp8.triples_by_identifier()
    expected = {
        Variable('a'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('l'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        URIRef('baa'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
        },
        URIRef('bar'): {
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        URIRef('faa'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        URIRef('boo'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        Literal('blub'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
    }
    assert tbi == expected, 'triples_by_identifier %s != %s' % (tbi, expected)
    # identifiers not occurring as nodes map to empty sets
    tbn = gp8.triples_by_nodes({
        Variable('a'), Variable('target'), URIRef('notthere'), URIRef('faa'),
        URIRef('boo')
    })
    expected = {
        Variable('a'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
            (Variable('a'), URIRef('bar'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        URIRef('faa'): {
            (URIRef('faa'), URIRef('boo'), Literal('blub')),
        },
        URIRef('notthere'): set(),
        URIRef('boo'): set(),
    }
    assert tbn == expected, 'triples_by_nodes %s != %s' % (tbn, expected)
    # same for edges; note the Variable('?target') input comes back keyed as
    # Variable('target') in the result below
    tbe = gp8.triples_by_edges({
        URIRef('baa'), Variable('a'), Variable('?target')
    })
    expected = {
        URIRef('baa'): {
            (Variable('a'), URIRef('baa'), Variable('l')),
        },
        Variable('target'): {
            (Variable('target'), Variable('target'), Variable('target')),
        },
        Variable('a'): set(),
    }
    assert tbe == expected, 'triples_by_edges %s != %s' % (tbe, expected)