def test_graph_pattern_canonicalization():
    """Regression test: canonicalizing a pattern must keep its triple count."""
    regression_cases = (
        # Triggers a bug in rdflib:
        # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct
        # bnodes, see https://github.com/RDFLib/rdflib/issues/494
        # For this pattern we currently hand back gp itself instead of a
        # canonical representation. Only the length is compared, so the test
        # keeps working should rdflib fix the issue.
        (
            (SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
            (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
            (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
            (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR),
        ),
        # Triggered a (since fixed) bug in our own canonicalization where the
        # pattern wasn't rewritten.
        (
            (TARGET_VAR, Variable('v0'), SOURCE_VAR),
            (TARGET_VAR, Variable('v0'), Variable('v1')),
            (TARGET_VAR, Variable('v2'), Variable('v1')),
            (TARGET_VAR, Variable('v2'), Variable('v3')),
            (TARGET_VAR, Variable('v4'), Variable('v5')),
        ),
    )
    for triples in regression_cases:
        gp = GraphPattern(triples)
        cgp = canonicalize(gp)
        assert len(gp) == len(cgp)
def test_graph_pattern_canonicalization():
    """Verify that canonicalize() preserves the length of known tricky patterns."""
    # First case exercises a bug in the rdflib library:
    # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct bnodes
    # (see https://github.com/RDFLib/rdflib/issues/494).
    # The pattern below runs into that problem; at the moment gp itself is
    # returned rather than a canonical form of it. Comparing only the lengths
    # keeps the test valid in case rdflib gets fixed.
    rdflib_collapse_triples = (
        (SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
        (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
        (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
        (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR),
    )
    original = GraphPattern(rdflib_collapse_triples)
    canonical = canonicalize(original)
    assert len(original) == len(canonical)

    # Second case covers a canonicalization bug (fixed) in which the pattern
    # was not rewritten.
    unrewritten_triples = (
        (TARGET_VAR, Variable('v0'), SOURCE_VAR),
        (TARGET_VAR, Variable('v0'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v3')),
        (TARGET_VAR, Variable('v4'), Variable('v5')),
    )
    original = GraphPattern(unrewritten_triples)
    canonical = canonicalize(original)
    assert len(original) == len(canonical)
def main():
    """Print all generated patterns of length 3 and check edge flipping.

    Every pattern obtained by flipping a single edge of a generated pattern
    (and that still has 3 triples) must canonicalize to a generated pattern.
    """
    length = 3
    generated = list(pattern_generator(length))
    for position, (pattern_id, pattern) in enumerate(generated):
        print('%d: Pattern id %d: %s' % (position, pattern_id, pattern))

    # The final generator item is dropped before building the lookup set
    # (presumably the trailing summary entry without a pattern).
    known_patterns = {gp for _, gp in generated[:-1]}

    # testing flipped edges
    for gp in known_patterns:
        for edge_idx in range(length):
            flipped = gp.flip_edge(edge_idx)
            if len(flipped) != length:
                # can happen that flipped edge was there already
                continue
            assert canonicalize(flipped) in known_patterns
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Generate graph patterns consisting of `length` triples.

    All combinations of `length` triples over the candidate node and edge
    variables are enumerated and filtered. Survivors are yielded as
    ``(pid, gp)`` where ``pid`` is the index of the combination. After the
    enumeration a final ``(n_patterns, None)`` item is yielded.

    :param length: number of triples per pattern.
    :param loops: if False, patterns with a triple whose subject equals its
        object are excluded.
    :param exclude_isomorphic: if True, each pattern is canonicalized, only
        the first pattern per canonical form is yielded, and the canonical
        form is what gets yielded.
    """
    seen_canonical = {}
    var_nodes = [Variable('n%d' % k) for k in range(length - 1)]
    node_pool = var_nodes + [SOURCE_VAR, TARGET_VAR]
    edge_pool = [Variable('e%d' % k) for k in range(length)]
    triple_pool = [
        (subj, pred, obj)
        for subj in node_pool
        for pred in edge_pool
        for obj in node_pool
    ]

    n_patterns = binom(len(triple_pool), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)

    emitted = 0
    pid = 0
    for pid, candidate in enumerate(combinations(triple_pool, length)):
        gp = GraphPattern(candidate)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue

        used_nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        used_edges = sorted(gp.edges)

        # check there are no skipped nodes, e.g., link to n2 picked but no n1
        if used_nodes != var_nodes[:len(used_nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        if used_edges != edge_pool[:len(used_edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in seen_canonical:
                first_pid, first_gp = seen_canonical[cgp]
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s',
                             pid, first_pid, gp, first_gp)
                continue
            seen_canonical[cgp] = (pid, gp)
            gp = cgp

        emitted += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                emitted, n_patterns, length)
    yield (n_patterns, None)
def main(length=4):
    """Enumerate graph patterns of `length` triples and dump them to disk.

    Each distinct pattern is printed and written (via ``_jsonify``) to
    ``data/enumerated_patterns_len<length>.jsonl.gz``. With ``canonical``
    enabled, the patterns obtained by flipping single edges are afterwards
    checked to canonicalize to one of the enumerated patterns.

    :param length: number of triples per pattern.
    :raises AssertionError: if a canonicalized pattern does not have `length`
        triples or a flipped pattern's canonical form was not enumerated.
    """
    # len | pcon | nej | all          | candidates (all)  | candidates (all)  |
    #     |      |     | (canonical)  | (old method)      | (numerical)       |
    # ----+------+-----+--------------+-------------------+-------------------+
    #   1 |    8 |  12 |           12 |                27 |                12 |
    #   2 |  146 | 469 |          693 |              7750 |              1314 |
    #   3 |      |     |        47478 |           6666891 |            151534 |
    #   4 |      |     |              |       11671285626 |          20884300 |
    #   5 |      |     |              |    34549552710596 |        3461471628 |

    # len | typical     | candidates     | candidates  |
    #     | (canonical) | (old method)   | (numerical) |
    # ----+-------------+----------------+-------------+
    #   1 |           2 |              4 |           2 |
    #   2 |          28 |            153 |          54 |
    #   3 |         486 |          17296 |        1614 |
    #   4 |       10374 |        3921225 |       59654 |
    #   5 |             |     1488847536 |     2707960 |
    # typical above means none of (loops, nej, pcon, source_target_edges)

    canonical = True

    _patterns = set()
    n = -1
    i = 0

    pg = patterns(
        length,
        loops=False,
        node_edge_joint=False,
        p_only_connected=False,
        source_target_edges=False,
        # when running under scoop the isomorphism check is done in parallel
        # below instead of inside the generator
        exclude_isomorphic=canonical and not scoop.IS_RUNNING,
        count_candidates_only=False,
    )

    out_path = path.join(
        'data', 'enumerated_patterns_len%d.jsonl.gz' % length)
    # The context manager makes sure the gzip stream is flushed and closed
    # once all patterns are written, even if an assertion fails on the way.
    # NOTE(review): the generator's trailing (pid, None) item is passed to
    # _jsonify and written as well -- confirm that this is intended.
    with gzip.open(out_path, 'w') as f:
        if canonical and scoop.IS_RUNNING:
            # Graph pattern isomorphism checking is what takes by far the
            # longest.
            # run canonicalization in parallel
            # chunks used for efficiency and to hinder parallel_map from
            # trying to eat up all candidates first
            for chunk in chunker(pg, 100000):
                cgps = parallel_map(
                    lambda res: (
                        res[0],
                        canonicalize(res[1]) if res[1] else None,
                    ),
                    chunk
                )
                for i, pattern in cgps:
                    if pattern not in _patterns:
                        n += 1
                        print('%d: Pattern id %d: %s' % (n, i, pattern))
                        assert pattern is None or len(pattern) == length, \
                            'pattern too short: %s' % (pattern,)
                        _patterns.add(pattern)
                        f.write(_jsonify(pattern))
        else:
            # run potential canonicalization inline
            for n, (i, pattern) in enumerate(pg):
                print('%d: Pattern id %d: %s' % (n, i, pattern))
                _patterns.add(pattern)
                f.write(_jsonify(pattern))

    # last res of pg is (i, None)
    _patterns.remove(None)
    print('Number of pattern candidates: %d' % i)
    print('Number of patterns: %d' % n)

    # testing flipped edges (only works if we're working with canonicals)
    if canonical:
        mod_gps = []
        for gp in _patterns:
            for i in range(length):
                mod_gp = gp.flip_edge(i)
                # can happen that flipped edge was there already
                if len(mod_gp) == length:
                    mod_gps.append(mod_gp)
        cmod_pgs = parallel_map(
            canonicalize,
            mod_gps
        )
        for i, cmod_pg in enumerate(cmod_pgs):
            assert cmod_pg in _patterns, \
                'not in patterns: mod_gp: %scanon: %s_patterns: %r...' % (
                    mod_gps[i], cmod_pg, list(_patterns)[:20]
                )
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generate graph patterns of `length` triples by brute-force enumeration.

    All combinations of `length` candidate triples are enumerated and
    filtered. Survivors are yielded as ``(pid, gp)`` with ``pid`` being the
    index of the combination. The last item yielded is always
    ``(n_patterns, None)``.

    :param length: number of triples per pattern.
    :param loops: if False, patterns with a triple whose subject equals its
        object are excluded.
    :param node_edge_joint: if True, nodes and edges draw from one shared
        variable pool; otherwise separate pools are used and patterns sharing
        a variable between nodes and edges are excluded.
    :param p_only_connected: passed as ``via_edges`` to
        ``GraphPattern.is_connected``.
    :param source_target_edges: if True, the source and target variables are
        also candidate edges (requires `node_edge_joint`).
    :param exclude_isomorphic: if True, patterns are canonicalized, only the
        first one per canonical form is yielded and the canonical form is
        what gets yielded.
    :param count_candidates_only: if True, only ``(n_patterns, None)`` is
        yielded without enumerating anything.
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    seen_canonical = {}

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target,
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [Variable('v%d' % k) for k in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        possible_edges = (
            possible_nodes if source_target_edges else possible_vars
        )
    else:
        possible_var_nodes = [Variable('n%d' % k) for k in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % k) for k in range(length)]

    possible_triples = [
        (subj, pred, obj)
        for subj in possible_nodes
        for pred in possible_edges
        for obj in possible_nodes
    ]

    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    emitted = 0
    pid = 0
    endpoints = {SOURCE_VAR, TARGET_VAR}
    for pid, candidate in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(candidate)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue

        nodes = sorted(gp.nodes - endpoints)
        edges = sorted(gp.edges - endpoints)
        vars_ = sorted(gp.vars_in_graph - endpoints)

        # check there are no skipped variables (nodes or edges)
        # noinspection PyUnboundLocalVariable
        if node_edge_joint:
            has_gap = vars_ != possible_vars[:len(vars_)]
        else:
            has_gap = (
                nodes != possible_var_nodes[:len(nodes)] or
                edges != possible_edges[:len(edges)]
            )
        if has_gap:
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in seen_canonical:
                first_pid, first_gp = seen_canonical[cgp]
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid, first_pid, gp, first_gp
                )
                continue
            seen_canonical[cgp] = (pid, gp)
            gp = cgp

        emitted += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        emitted, n_patterns, length
    )
    yield (n_patterns, None)
def patterns(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Turn numerical patterns into actual graph patterns.

    For every numerical pattern from ``numerical_patterns`` each ordered pair
    of eligible numbers is mapped to the source and target variable, the
    remaining numbers are mapped to ``v0, v1, ...`` and the resulting
    ``GraphPattern`` is yielded as ``(pid, gp)``. The last item yielded is
    ``(pid + 1, None)``.

    :param length: number of triples per pattern.
    :param loops: forwarded to ``numerical_patterns``.
    :param node_edge_joint: forwarded to ``numerical_patterns``.
    :param p_only_connected: if False, numerical patterns whose
        ``to_nx_graph`` graph is not connected are skipped.
    :param source_target_edges: if False, numbers occurring at index 1 of a
        triple are not eligible as source or target (requires
        `node_edge_joint` if True).
    :param exclude_isomorphic: if True, patterns are canonicalized, only the
        first one per canonical form is yielded and the canonical form is
        what gets yielded.
    :param count_candidates_only: if True, no patterns are built; only the
        running pattern id is advanced (cannot be combined with
        `exclude_isomorphic`).
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'

    seen_canonical = {}

    pid = -1
    numerical = numerical_patterns(
        length,
        loops=loops,
        node_edge_joint=node_edge_joint,
    )
    for c, num_pat in enumerate(numerical):
        assert len(num_pat) == length, 'too short: %s' % (num_pat,)
        flat = [number for triple in num_pat for number in triple]
        distinct = set(flat)

        # Numerical patterns are always connected, but they might be
        # p_only_connected (e.g., 123 425).
        # Check that the pattern isn't p_only_connected, meaning that it's
        # also connected by nodes (e.g., 123 325).
        # Note that in case of node_edge_joint 123 245 is also considered
        # p_only_connected.
        if not p_only_connected and not nx.is_connected(to_nx_graph(num_pat)):
            logger.debug('excluded %d: not node connected:\n%s', c, num_pat)
            continue

        all_numbers = sorted(distinct)
        if source_target_edges:
            endpoint_numbers = all_numbers
        else:
            # every second entry of a triple is its edge; edges may not
            # become source or target here
            endpoint_numbers = sorted(distinct - set(flat[1::3]))

        if count_candidates_only:
            n_vars = len(endpoint_numbers)
            perms = n_vars * (n_vars - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info(
                    'pattern id: %d, vars: %d, permutations: %d',
                    pid, n_vars, perms
                )
            continue

        for s, t in permutations(endpoint_numbers, 2):
            pid += 1
            # source and target are mapped to numbers s and t
            # re-enumerate the leftover numbers to close "holes"
            leftover = [num for num in all_numbers if num != s and num != t]
            var_map = {}
            for idx, num in enumerate(leftover):
                var_map[num] = Variable('v%d' % idx)
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            gp = GraphPattern(tuple(
                tuple(var_map[num] for num in triple) for triple in num_pat
            ))
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)

            # exclude patterns which are isomorphic to already generated ones
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in seen_canonical:
                    igp_numpat, igp_s, igp_t, igp_gp = seen_canonical[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat, s, t,
                        igp_numpat, igp_s, igp_t,
                        gp, igp_gp,
                    )
                    continue
                seen_canonical[cgp] = (num_pat, s, t, gp)
                gp = cgp
            yield pid, gp
    yield pid + 1, None
def main():
    """Enumerate and print graph patterns of length 5, then check flips.

    With ``canonical`` enabled, every pattern obtained by flipping one edge of
    an enumerated pattern (and still having 5 triples) must canonicalize to
    an enumerated pattern.
    """
    # len | pcon | nej | all          | candidates (all)  | candidates (all)  |
    #     |      |     | (canonical)  | (old method)      | (numerical)       |
    # ----+------+-----+--------------+-------------------+-------------------+
    #   1 |    8 |  12 |           12 |                27 |                12 |
    #   2 |  146 | 469 |          693 |              7750 |              1314 |
    #   3 |      |     |        47478 |           6666891 |            151534 |
    #   4 |      |     |              |       11671285626 |          20884300 |
    #   5 |      |     |              |    34549552710596 |        3461471628 |

    # len | typical     | candidates     | candidates  |
    #     | (canonical) | (old method)   | (numerical) |
    # ----+-------------+----------------+-------------+
    #   1 |           2 |              4 |           2 |
    #   2 |          28 |            153 |          54 |
    #   3 |         486 |          17296 |        1614 |
    #   4 |       10374 |        3921225 |       59654 |
    #   5 |             |     1488847536 |     2707960 |
    # typical above means none of (loops, nej, pcon, source_target_edges)

    length = 5
    canonical = True
    parallel_canonicalization = canonical and scoop.IS_RUNNING

    _patterns = set()
    found = -1
    last_pid = 0

    pg = patterns(
        length,
        loops=False,
        node_edge_joint=False,
        p_only_connected=False,
        source_target_edges=False,
        exclude_isomorphic=canonical and not scoop.IS_RUNNING,
        count_candidates_only=False,
    )

    if parallel_canonicalization:
        # Graph pattern isomorphism checking is what takes by far the longest.
        # run canonicalization in parallel
        # chunks used for efficiency and to hinder parallel_map from trying to
        # eat up all candidates first
        for chunk in chunker(pg, 100000):
            cgps = parallel_map(
                lambda res: (res[0], canonicalize(res[1]) if res[1] else None),
                chunk)
            for last_pid, pattern in cgps:
                if pattern in _patterns:
                    continue
                found += 1
                print('%d: Pattern id %d: %s' % (found, last_pid, pattern))
                assert pattern is None or len(pattern) == length, \
                    'pattern too short: %s' % (pattern,)
                _patterns.add(pattern)
    else:
        # run potential canonicalization inline
        for found, (last_pid, pattern) in enumerate(pg):
            print('%d: Pattern id %d: %s' % (found, last_pid, pattern))
            _patterns.add(pattern)

    # last res of pg is (i, None)
    _patterns.remove(None)
    print('Number of pattern candidates: %d' % last_pid)
    print('Number of patterns: %d' % found)

    if not canonical:
        return

    # testing flipped edges (only works if we're working with canonicals)
    mod_gps = [
        mod_gp
        for gp in _patterns
        for mod_gp in (gp.flip_edge(edge_idx) for edge_idx in range(length))
        # can happen that flipped edge was there already
        if len(mod_gp) == length
    ]
    cmod_pgs = parallel_map(canonicalize, mod_gps)
    for idx, cmod_pg in enumerate(cmod_pgs):
        assert cmod_pg in _patterns, \
            'not in patterns: mod_gp: %scanon: %s_patterns: %r...' % (
                mod_gps[idx], cmod_pg, list(_patterns)[:20]
            )
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Enumerate candidate triple combinations and yield the valid patterns.

    Yields ``(pid, gp)`` for each combination of `length` triples passing all
    filters, where ``pid`` is the combination's index, followed by one final
    ``(n_patterns, None)`` item.

    :param length: number of triples per pattern.
    :param loops: when False, drop patterns with a triple whose subject and
        object are equal.
    :param node_edge_joint: when True, nodes and edges use a common variable
        pool; when False, separate pools are used and patterns in which a
        variable is both node and edge are dropped.
    :param p_only_connected: handed to ``GraphPattern.is_connected`` as
        ``via_edges``.
    :param source_target_edges: when True, source and target variables may
        also appear as edges (only valid together with `node_edge_joint`).
    :param exclude_isomorphic: when True, canonicalize each pattern, yield
        only the first per canonical form, and yield the canonical form.
    :param count_candidates_only: when True, yield just
        ``(n_patterns, None)`` and stop.
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    canonical_index = {}

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target,
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [
            Variable('v%d' % num) for num in range((2 * length) - 1)
        ]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        possible_edges = possible_vars
        if source_target_edges:
            possible_edges = possible_nodes
    else:
        possible_var_nodes = [
            Variable('n%d' % num) for num in range(length - 1)
        ]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % num) for num in range(length)]

    possible_triples = []
    for subj in possible_nodes:
        for pred in possible_edges:
            for obj in possible_nodes:
                possible_triples.append((subj, pred, obj))

    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d', n_patterns,
                length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    n_generated = 0
    pid = 0
    for pid, triples in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(triples)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue

        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})

        # check there are no skipped variables (nodes or edges)
        # noinspection PyUnboundLocalVariable
        if node_edge_joint:
            skipped_var = vars_ != possible_vars[:len(vars_)]
        else:
            skipped_var = (nodes != possible_var_nodes[:len(nodes)] or
                           edges != possible_edges[:len(edges)])
        if skipped_var:
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            earlier = canonical_index.get(cgp)
            if earlier is not None:
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s', pid,
                             earlier[0], gp, earlier[1])
                continue
            canonical_index[cgp] = (pid, gp)
            gp = cgp

        n_generated += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                n_generated, n_patterns, length)
    yield (n_patterns, None)
def patterns(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Generate actual graph patterns from numerical patterns.

    Each numerical pattern produced by ``numerical_patterns`` is expanded
    into one ``GraphPattern`` per ordered (source, target) choice among its
    eligible numbers. Items are yielded as ``(pid, gp)``; a closing
    ``(pid + 1, None)`` item is yielded at the end.

    :param length: number of triples per pattern.
    :param loops: passed through to ``numerical_patterns``.
    :param node_edge_joint: passed through to ``numerical_patterns``.
    :param p_only_connected: when False, skip numerical patterns for which
        ``nx.is_connected(to_nx_graph(num_pat))`` is false.
    :param source_target_edges: when False, numbers found at index 1 of a
        triple cannot be picked as source or target (True is only valid
        together with `node_edge_joint`).
    :param exclude_isomorphic: when True, canonicalize each pattern, yield
        only the first per canonical form, and yield the canonical form.
    :param count_candidates_only: when True, only advance the pattern id
        counter without building patterns (not allowed together with
        `exclude_isomorphic`).
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'

    first_by_canonical = {}
    pid = -1

    source_of_num_pats = numerical_patterns(
        length,
        loops=loops,
        node_edge_joint=node_edge_joint,
    )
    for c, num_pat in enumerate(source_of_num_pats):
        assert len(num_pat) == length, 'too short: %s' % (num_pat, )
        flat_num_pat = []
        for trip in num_pat:
            flat_num_pat.extend(trip)
        number_set = set(flat_num_pat)

        if not p_only_connected:
            # Numerical patterns are always connected, but they might be
            # p_only_connected (e.g., 123 425).
            # Check that the pattern isn't p_only_connected, meaning that it's
            # also connected by nodes (e.g., 123 325).
            # Note that in case of node_edge_joint 123 245 is also considered
            # p_only_connected.
            node_connected = nx.is_connected(to_nx_graph(num_pat))
            if not node_connected:
                logger.debug('excluded %d: not node connected:\n%s', c,
                             num_pat)
                continue

        all_numbers = sorted(number_set)
        if source_target_edges:
            candidates = all_numbers
        else:
            edge_numbers = set(flat_num_pat[1::3])
            candidates = sorted(number_set - edge_numbers)

        if count_candidates_only:
            n_candidates = len(candidates)
            perms = n_candidates * (n_candidates - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info('pattern id: %d, vars: %d, permutations: %d', pid,
                            n_candidates, perms)
            continue

        for s, t in permutations(candidates, 2):
            pid += 1
            # source and target are mapped to numbers s and t
            # re-enumerate the leftover numbers to close "holes"
            remaining = [x for x in all_numbers if x != s and x != t]
            var_map = {
                x: Variable('v%d' % pos)
                for pos, x in enumerate(remaining)
            }
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            mapped_triples = tuple(
                tuple(var_map[x] for x in trip) for trip in num_pat
            )
            gp = GraphPattern(mapped_triples)
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)

            # exclude patterns which are isomorphic to already generated ones
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in first_by_canonical:
                    igp_numpat, igp_s, igp_t, igp_gp = first_by_canonical[cgp]
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat,
                        s,
                        t,
                        igp_numpat,
                        igp_s,
                        igp_t,
                        gp,
                        igp_gp,
                    )
                    continue
                first_by_canonical[cgp] = (num_pat, s, t, gp)
                gp = cgp
            yield pid, gp
    yield pid + 1, None
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Enumerate graph patterns made of `length` triples.

    Yields ``(pid, gp)`` for every combination of `length` candidate triples
    that passes all filters (``pid`` is the combination's index) and finally
    ``(n_patterns, None)``.

    :param length: number of triples per pattern.
    :param loops: when False, drop patterns with a triple whose subject and
        object are equal.
    :param exclude_isomorphic: when True, canonicalize each pattern, yield
        only the first per canonical form, and yield the canonical form.
    """
    canonical_index = {}
    possible_var_nodes = [Variable('n%d' % num) for num in range(length - 1)]
    possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
    possible_edges = [Variable('e%d' % num) for num in range(length)]

    possible_triples = []
    for subj in possible_nodes:
        for pred in possible_edges:
            for obj in possible_nodes:
                possible_triples.append((subj, pred, obj))

    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)

    n_generated = 0
    pid = 0
    for pid, triples in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(triples)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue

        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges)

        # check there are no skipped nodes, e.g., link to n2 picked but no n1
        nodes_contiguous = nodes == possible_var_nodes[:len(nodes)]
        if not nodes_contiguous:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        edges_contiguous = edges == possible_edges[:len(edges)]
        if not edges_contiguous:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any(s == o for s, p, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            earlier = canonical_index.get(cgp)
            if earlier is not None:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid, earlier[0], gp, earlier[1]
                )
                continue
            canonical_index[cgp] = (pid, gp)
            gp = cgp

        n_generated += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        n_generated, n_patterns, length
    )
    yield (n_patterns, None)