def test_graph_pattern_canonicalization():
    """Canonicalizing a pattern must keep its number of triples."""
    # Regression test for a bug in rdflib:
    # rdflib.compare.to_canonical_graph(g) sometimes collapses distinct bnodes
    # (see https://github.com/RDFLib/rdflib/issues/494).
    # The pattern below triggers that problem; currently gp itself is returned
    # instead of a canonical representation of it. Only the length is
    # compared, in case it gets fixed in rdflib.
    collapsing_triples = (
        (SOURCE_VAR, Variable('vcb0'), TARGET_VAR),
        (SOURCE_VAR, Variable('vrBYUk8'), TARGET_VAR),
        (TARGET_VAR, Variable('vrBYUk8'), SOURCE_VAR),
        (TARGET_VAR, Variable('vrvGapn'), SOURCE_VAR),
    )
    pattern = GraphPattern(collapsing_triples)
    canonical = canonicalize(pattern)
    assert len(pattern) == len(canonical)

    # Regression test for a bug in canonicalization "when it didn't rewrite
    # fixed" (the original note appears to be truncated here).
    fan_out_triples = (
        (TARGET_VAR, Variable('v0'), SOURCE_VAR),
        (TARGET_VAR, Variable('v0'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v1')),
        (TARGET_VAR, Variable('v2'), Variable('v3')),
        (TARGET_VAR, Variable('v4'), Variable('v5')),
    )
    pattern = GraphPattern(fan_out_triples)
    canonical = canonicalize(pattern)
    assert len(pattern) == len(canonical)
def load_results(fn):
    """Load result patterns, coverage counts and GTP scores from a file.

    Args:
        fn: Path of a gzip-compressed JSON result file.

    Returns:
        A tuple ``(result_patterns, coverage_counts, gtp_scores)``.
        ``result_patterns`` is a list of ``(GraphPattern, found_in_run)``
        pairs, ``coverage_counts`` is a ``Counter`` keyed by decurified
        ``(source, target)`` pairs and ``gtp_scores`` is a ``GTPScores``
        object, or ``None`` if the file contains no ground truth pairs.

    Raises:
        KeyError: If the file has no ``'patterns'`` entry.
    """
    logger.info('loading results from: %s', fn)
    with gzip.open(fn) as f:
        res = json.load(f)
    result_patterns = [
        (GraphPattern.from_dict(pattern_run['graph_pattern']),
         pattern_run['found_in_run'])
        for pattern_run in res['patterns']
    ]
    coverage_counts = Counter({
        (decurify(s), decurify(t)): c
        for (s, t), c in res.get('coverage_counts', [])
    })
    gtp_scores = None
    # Fall back to an empty list like the other optional keys do: without it
    # a file lacking 'ground_truth_pairs' (or holding null) made the
    # comprehension iterate over None and raise a TypeError.
    gtps = [tuple(gtp) for gtp in res.get('ground_truth_pairs') or []]
    if gtps:
        coverage_max_precision = res.get('overall_coverage_max_precision', [])
        if not coverage_max_precision:
            # final result file for example
            coverage_max_precision = res.get('coverage_max_precision', [])
        gtp_scores = GTPScores(gtps)
        gtp_scores.gtp_max_precisions = OrderedDict([
            ((decurify(s), decurify(t)), mp)
            for (s, t), mp in coverage_max_precision
        ])
    logger.info('loaded %d result patterns', len(result_patterns))
    return result_patterns, coverage_counts, gtp_scores
def load_results(fn):
    """Load result patterns, coverage counts and GTP scores from a file.

    Args:
        fn: Path of a gzip-compressed JSON result file.

    Returns:
        A tuple ``(result_patterns, coverage_counts, gtp_scores)``.
        ``result_patterns`` is a list of ``(GraphPattern, found_in_run)``
        pairs, ``coverage_counts`` is a ``Counter`` keyed by decurified
        ``(source, target)`` pairs and ``gtp_scores`` is a ``GTPScores``
        object, or ``None`` if the file contains no ground truth pairs.

    Raises:
        KeyError: If the file has no ``'patterns'`` entry.
    """
    logger.info('loading results from: %s', fn)
    with gzip.open(fn) as f:
        res = json.load(f)
    result_patterns = [
        (GraphPattern.from_dict(pattern_run['graph_pattern']),
         pattern_run['found_in_run'])
        for pattern_run in res['patterns']
    ]
    coverage_counts = Counter({
        (decurify(s), decurify(t)): c
        for (s, t), c in res.get('coverage_counts', [])
    })
    gtp_scores = None
    # Fall back to an empty list like the other optional keys do: without it
    # a file lacking 'ground_truth_pairs' (or holding null) made the
    # comprehension iterate over None and raise a TypeError.
    gtps = [tuple(gtp) for gtp in res.get('ground_truth_pairs') or []]
    if gtps:
        coverage_max_precision = res.get('overall_coverage_max_precision', [])
        if not coverage_max_precision:
            # final result file for example
            coverage_max_precision = res.get('coverage_max_precision', [])
        gtp_scores = GTPScores(gtps)
        gtp_scores.gtp_max_precisions = OrderedDict([
            ((decurify(s), decurify(t)), mp)
            for (s, t), mp in coverage_max_precision
        ])
    logger.info('loaded %d result patterns', len(result_patterns))
    return result_patterns, coverage_counts, gtp_scores
def test_predict_query():
    """predict_query for Algebra must return a non-empty result with Mathematics."""
    algebra = URIRef('http://dbpedia.org/resource/Algebra')
    mathematics = URIRef('http://dbpedia.org/resource/Mathematics')
    triples = [
        (SOURCE_VAR, wpl, TARGET_VAR),
        (TARGET_VAR, wpl, SOURCE_VAR),
        (TARGET_VAR, a, Variable('target_type')),
    ]
    pattern = GraphPattern(triples)
    # the first element of the returned pair is not checked here
    _, predictions = predict_query(sparql, timeout, pattern, algebra)
    assert len(predictions) > 0
    assert mathematics in predictions
def test_mutate_fix_var():
    """Exercise mutate_fix_var on a small set of ground truth pairs."""
    # tests on a small subset
    pair_names = [
        ('Armour', 'Knight'),
        ('Barrel', 'Wine'),
        ('Barrister', 'Law'),
        ('Barrister', 'Lawyer'),
        ('Beak', 'Bird'),
        ('Beetroot', 'Red'),
        ('Belief', 'Religion'),
        ('Blanket', 'Bed'),
        ('Boot', 'Shoe'),
        ('Brine', 'Salt'),
    ]
    ground_truth_pairs_ = [(dbp[s], dbp[t]) for s, t in pair_names]
    edge_var = Variable('v')
    scores = GTPScores(ground_truth_pairs_)

    # A single variable edge: every mutant must differ from the parent and
    # must no longer contain the variable.
    parent = GraphPattern([(SOURCE_VAR, edge_var, TARGET_VAR)])
    mutants = mutate_fix_var(sparql, timeout, scores, parent)
    assert mutants
    for mutant in mutants:
        logger.info(mutant.to_sparql_select_query())
        assert parent != mutant
        assert edge_var not in mutant.vars_in_graph

    # With the target restricted to schema.org Country, fixing the edge
    # variable must hand back the unchanged pattern.
    parent = GraphPattern([
        (SOURCE_VAR, edge_var, TARGET_VAR),
        (SOURCE_VAR, a, Variable('source_type')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country')),
    ])
    mutants = mutate_fix_var(
        sparql, timeout, scores, parent, rand_var=edge_var)
    assert mutants
    for mutant in mutants:
        logger.info(mutant.to_sparql_select_query())
        assert parent == mutant, 'should not have found any substitution'

    # After adding (Berlin, Germany) to the ground truth a substitution is
    # expected for the same pattern.
    ground_truth_pairs_.append((dbp['Berlin'], dbp['Germany']))
    scores = GTPScores(ground_truth_pairs_)
    mutants = mutate_fix_var(sparql, timeout, scores, parent)
    assert mutants
    for mutant in mutants:
        logger.info(mutant.to_sparql_select_query())
        assert parent != mutant, 'should have found a substitution'
        assert parent.vars_in_graph - mutant.vars_in_graph
def test_variable_substitution_query():
    """Check variable_substitution_query for an edge and for a node variable."""
    pair_uris = [
        ('http://dbpedia.org/resource/Adolescence',
         'http://dbpedia.org/resource/Youth'),
        ('http://dbpedia.org/resource/Adult',
         'http://dbpedia.org/resource/Child'),
        ('http://dbpedia.org/resource/Affinity_(law)',
         'http://dbpedia.org/resource/Mother'),
        ('http://dbpedia.org/resource/Alchemy',
         'http://dbpedia.org/resource/Gold'),
        ('http://dbpedia.org/resource/Alderman',
         'http://dbpedia.org/resource/Mayor'),
        ('http://dbpedia.org/resource/Algebra',
         'http://dbpedia.org/resource/Mathematics'),
        ('http://dbpedia.org/resource/Amen',
         'http://dbpedia.org/resource/Prayer'),
        ('http://dbpedia.org/resource/Amnesia',
         'http://dbpedia.org/resource/Memory'),
        ('http://dbpedia.org/resource/Angel',
         'http://dbpedia.org/resource/Heaven'),
        ('http://dbpedia.org/resource/Arithmetic',
         'http://dbpedia.org/resource/Mathematics'),
    ]
    source_target_pairs = [(URIRef(s), URIRef(t)) for s, t in pair_uris]
    limit = MUTPB_FV_QUERY_LIMIT

    # substitute the edge variable of a typed source/target pattern
    edge = Variable('edge')
    gp = GraphPattern([
        (SOURCE_VAR, edge, TARGET_VAR),
        (SOURCE_VAR, a, Variable('source_type')),
        (TARGET_VAR, a, Variable('target_type')),
    ])
    _, counts = variable_substitution_query(
        sparql, timeout, gp, edge, source_target_pairs, limit)
    ranking = counts.most_common()
    logger.debug(ranking)
    # NOTE(review): the `2` below is only the assertion message; possibly a
    # comparison against (wpl, 2) was intended -- confirm before changing.
    assert ranking[0][0] == wpl, 2

    # substitute a node variable linking to the source
    node = Variable('var')
    gp = GraphPattern([
        (node, wpl, SOURCE_VAR),
    ])
    _, counts = variable_substitution_query(
        sparql, timeout, gp, node, source_target_pairs, limit)
    ranking = counts.most_common()
    logger.debug(ranking)
    assert (URIRef('http://dbpedia.org/resource/Human'), 3) in ranking
def test_evaluate():
    """Evaluate a PopulatedPlace/Country wikilink pattern and check its fitness."""
    triples = (
        (SOURCE_VAR, wikilink, TARGET_VAR),
        (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country')),
    )
    gp = GraphPattern(triples)
    eval_res = evaluate(sparql, timeout, gtp_scores, gp)
    # Reference fitness tuple (presumably taken from an earlier run):
    # (655, 0.4048, 0.4048, 0.0089, 7.5, 3, 3, 2, 0, 0.1936)
    # with the fields
    # (remains, score, gain, f_measure, avg_reslens, gt_matches,
    #  patlen, patvars, timeout, qtime)
    update_individuals([gp], [eval_res])
    fit = gp.fitness.values
    node_pairs = gp.matching_node_pairs
    precisions = gp.gtp_precisions
    # write back the value that was just read
    gp.matching_node_pairs = node_pairs
    logger.info(gp.matching_node_pairs)

    assert fit.remains == len(ground_truth_pairs), 'remains wrong?'
    assert fit.gt_matches == 3, "didn't match 3 gt pairs?"
    score = fit.score
    assert 0 < score < 0.5, 'score not correct?'
    assert score == fit.gain, 'score and gain should be the same here'
    assert 0 < fit.f_measure < 0.1, 'f1 measure not correct?'
    assert fit.patlen == 3, 'pattern should have 3 triples'
    assert fit.patvars == 2, 'pattern should have 2 vars'

    # the accepted bounds depend on whether the soft timeout was exceeded
    if query_time_soft_exceeded(fit.qtime, timeout):
        assert 0 < fit.avg_reslens < 15, \
            'avg match count out of bounds for timeout'
        assert fit.timeout > 0, 'should be a timeout'
    else:
        assert 0 < fit.avg_reslens < 10, 'avg match count should be ~7.5'
        assert fit.timeout == 0, 'should not be a timeout'

    assert isinstance(precisions, OrderedDict)
    assert list(precisions) == node_pairs
    logger.info(precisions)
    # NOTE(review): exact float comparison -- kept as in the original
    assert sum(precisions.values()) == fit.gain, \
        'sum of precisions should be gain in this case'
def test_evaluate():
    """Evaluate a PopulatedPlace/Country wikilink pattern and check its fitness."""
    triples = (
        (SOURCE_VAR, wikilink, TARGET_VAR),
        (SOURCE_VAR, a, URIRef('http://dbpedia.org/ontology/PopulatedPlace')),
        (TARGET_VAR, a, URIRef('http://schema.org/Country')),
    )
    gp = GraphPattern(triples)
    eval_res = evaluate(sparql, timeout, gtp_scores, gp)
    # Reference fitness tuple (presumably taken from an earlier run):
    # (655, 0.4048, 0.4048, 0.0089, 7.5, 3, 3, 2, 0, 0.1936)
    # with the fields
    # (remains, score, gain, f_measure, avg_reslens, gt_matches,
    #  patlen, patvars, timeout, qtime)
    update_individuals([gp], [eval_res])
    fit = gp.fitness.values
    node_pairs = gp.matching_node_pairs
    precisions = gp.gtp_precisions
    # write back the value that was just read
    gp.matching_node_pairs = node_pairs
    logger.info(gp.matching_node_pairs)

    assert fit.remains == len(ground_truth_pairs), 'remains wrong?'
    assert fit.gt_matches == 3, "didn't match 3 gt pairs?"
    score = fit.score
    assert 0 < score < 0.5, 'score not correct?'
    assert score == fit.gain, 'score and gain should be the same here'
    assert 0 < fit.f_measure < 0.1, 'f1 measure not correct?'
    assert fit.patlen == 3, 'pattern should have 3 triples'
    assert fit.patvars == 2, 'pattern should have 2 vars'

    # the accepted bounds depend on whether the soft timeout was exceeded
    if query_time_soft_exceeded(fit.qtime, timeout):
        assert 0 < fit.avg_reslens < 15, \
            'avg match count out of bounds for timeout'
        assert fit.timeout > 0, 'should be a timeout'
    else:
        assert 0 < fit.avg_reslens < 10, 'avg match count should be ~7.5'
        assert fit.timeout == 0, 'should not be a timeout'

    assert isinstance(precisions, OrderedDict)
    assert list(precisions) == node_pairs
    logger.info(precisions)
    # NOTE(review): exact float comparison -- kept as in the original
    assert sum(precisions.values()) == fit.gain, \
        'sum of precisions should be gain in this case'
def test_timeout_pattern():
    """A pattern expected to exceed the soft query timeout must score 0."""
    reflist = URIRef('http://dbpedia.org/resource/Template:Reflist')
    disambiguates = URIRef(
        'http://dbpedia.org/ontology/wikiPageDisambiguates')
    triples = [
        (SOURCE_VAR, Variable('v1'), reflist),
        (SOURCE_VAR, Variable('v5'), reflist),
        (TARGET_VAR, Variable('v0'), reflist),
        (TARGET_VAR, Variable('v3'), reflist),
        (TARGET_VAR, Variable('v6'), Variable('v2')),
        (Variable('v4'), disambiguates, TARGET_VAR),
    ]
    gp = GraphPattern(triples)
    eval_res = evaluate(sparql, timeout, gtp_scores, gp)
    update_individuals([gp], [eval_res])
    fit = gp.fitness.values
    node_pairs = gp.matching_node_pairs
    # write back the value that was just read
    gp.matching_node_pairs = node_pairs
    logger.info(gp.matching_node_pairs)

    assert query_time_soft_exceeded(fit.qtime, timeout)
    assert fit.score == 0
    # the expected f-measure depends on whether the hard limit was hit too
    hard_exceeded = query_time_hard_exceeded(fit.qtime, timeout)
    if not hard_exceeded:
        assert fit.f_measure > 0
    else:
        assert fit.f_measure == 0
def test_timeout_pattern():
    """A pattern expected to exceed the soft query timeout must score 0."""
    reflist = URIRef('http://dbpedia.org/resource/Template:Reflist')
    disambiguates = URIRef(
        'http://dbpedia.org/ontology/wikiPageDisambiguates')
    triples = [
        (SOURCE_VAR, Variable('v1'), reflist),
        (SOURCE_VAR, Variable('v5'), reflist),
        (TARGET_VAR, Variable('v0'), reflist),
        (TARGET_VAR, Variable('v3'), reflist),
        (TARGET_VAR, Variable('v6'), Variable('v2')),
        (Variable('v4'), disambiguates, TARGET_VAR),
    ]
    gp = GraphPattern(triples)
    eval_res = evaluate(sparql, timeout, gtp_scores, gp)
    update_individuals([gp], [eval_res])
    fit = gp.fitness.values
    node_pairs = gp.matching_node_pairs
    # write back the value that was just read
    gp.matching_node_pairs = node_pairs
    logger.info(gp.matching_node_pairs)

    assert query_time_soft_exceeded(fit.qtime, timeout)
    assert fit.score == 0
    # the expected f-measure depends on whether the hard limit was hit too
    hard_exceeded = query_time_hard_exceeded(fit.qtime, timeout)
    if not hard_exceeded:
        assert fit.f_measure > 0
    else:
        assert fit.f_measure == 0
def test_graph_pattern_connectedness():
    """Patterns joined only via a shared edge variable need via_edges=True."""
    # test edge var connections
    shared = Variable('p')
    edge_only_connected = (
        # shared variable is the predicate of both triples
        [
            (SOURCE_VAR, shared, Variable('v1')),
            (TARGET_VAR, shared, Variable('v2')),
        ],
        # shared variable is predicate of the first, subject of the second
        [
            (SOURCE_VAR, shared, Variable('v1')),
            (shared, Variable('v2'), TARGET_VAR),
        ],
    )
    for triples in edge_only_connected:
        gp = GraphPattern(triples)
        assert not gp.is_connected(), \
            "shouldn't be connected with nodes only: %s" % (gp,)
        assert gp.is_connected(via_edges=True), \
            "should be connected via edges: %s" % (gp,)
Example #12
0
def main():
    """Generate triples for a fixed two-triple pattern and load them."""
    from rdflib import Variable

    # source and target both point to the same variable node ?v2
    pattern_triples = (
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    )
    gp = GraphPattern(pattern_triples)

    # get the list of semantic association pairs; the split into train and
    # test sets (split_training_test_set + tuple(sorted(assocs_train))) is
    # disabled, so all pairs are used
    stps = get_semantic_associations(
        fn='data/dbpedia_random_1000k_uri_pairs.csv.gz',
        limit=None,
    )

    load_triples_into_endpoint(generate_triples(gp, stps))
def random_path(length):
    """Return a randomly oriented path of `length` triples.

    The underlying chain between source and target looks like:
        (?source, ?ve1, ?vn1), (?vn1, ?ve2, ?vn2), ... (?vn(l-1), ?vel, ?target)

    Each triple independently gets its subject and object swapped when a
    fresh ``random.random()`` draw is below 0.5.
    """
    assert length > 0
    # node chain: SOURCE_VAR, vn1, ..., vn(length-1), TARGET_VAR
    chain = [SOURCE_VAR]
    chain.extend(Variable('vn%d' % i) for i in range(1, length))
    chain.append(TARGET_VAR)
    predicates = [Variable('ve%d' % i) for i in range(1, length + 1)]

    oriented = []
    for subj, pred, obj in zip(chain, predicates, chain[1:]):
        if random.random() < .5:
            subj, obj = obj, subj  # flip this edge
        oriented.append((subj, pred, obj))
    return GraphPattern(oriented)
def simple_paths(length):
    """Yield ``(n, gp)`` for each orientation of the path of given length.

    The underlying chain between source and target looks like:
        (?source, ?ve1, ?vn1), (?vn1, ?ve2, ?vn2), ... (?vn(l-1), ?vel, ?target)

    One pattern is produced per item of ``powerset(range(length))``; the
    triples whose index is in that item are flipped. Assuming powerset
    yields every subset, that makes 2**length paths.
    """
    assert length > 0
    # node chain: SOURCE_VAR, vn1, ..., vn(length-1), TARGET_VAR
    chain = [SOURCE_VAR]
    chain.extend(Variable('vn%d' % i) for i in range(1, length))
    chain.append(TARGET_VAR)
    predicates = [Variable('ve%d' % i) for i in range(1, length + 1)]
    forward = list(zip(chain, predicates, chain[1:]))

    for n, edges_to_flip in enumerate(powerset(range(length))):
        oriented = []
        for idx, (subj, pred, obj) in enumerate(forward):
            if idx in edges_to_flip:
                oriented.append((obj, pred, subj))
            else:
                oriented.append((subj, pred, obj))
        yield n, GraphPattern(oriented)
def main():
    """Print the triples generated for a fixed two-triple pattern."""
    from rdflib import Variable

    # the following triple will timeout if vars_joint was 0:
    # ?s a owl:Thing . ?t a owl:Thing .
    pattern_triples = (
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (TARGET_VAR, Variable('v3'), Variable('v2')),
    )
    gp = GraphPattern(pattern_triples)

    # get the list of semantic association pairs; the split into train and
    # test sets (split_training_test_set + tuple(sorted(assocs_train))) is
    # disabled, so all pairs are used
    stps = get_semantic_associations(
        fn='data/dbpedia_random_1000_uri_pairs.csv.gz',
        limit=100,
    )
    print(len(stps))

    for triple in generate_triples(gp, stps):
        print(triple)
def test_graph_pattern_connectedness():
    """Patterns joined only via a shared edge variable need via_edges=True."""
    # test edge var connections
    shared = Variable('p')
    edge_only_connected = (
        # shared variable is the predicate of both triples
        [
            (SOURCE_VAR, shared, Variable('v1')),
            (TARGET_VAR, shared, Variable('v2')),
        ],
        # shared variable is predicate of the first, subject of the second
        [
            (SOURCE_VAR, shared, Variable('v1')),
            (shared, Variable('v2'), TARGET_VAR),
        ],
    )
    for triples in edge_only_connected:
        gp = GraphPattern(triples)
        assert not gp.is_connected(), \
            "shouldn't be connected with nodes only: %s" % (gp,)
        assert gp.is_connected(via_edges=True), \
            "should be connected via edges: %s" % (gp,)
def test_simplify_pattern():
    """Check mutate_simplify_pattern on bloated and on irreducible patterns.

    Each section builds a pattern and asserts either that the result equals
    the expected reduced pattern ('not simplified' messages) or that the
    pattern is returned unchanged ('was simplified (bad)' messages).
    Note that `gp` is extended in place (`+=`) across several sections, so
    the sections depend on their order.
    """
    # a single fixed edge between source and target must stay as it is
    gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
    res = mutate_simplify_pattern(gp)
    assert gp == res, 'should not simplify simple pattern'

    # test parallel single var edges
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v1'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated += [
        (SOURCE_VAR, Variable('v2'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()

    # test edges between fixed nodes
    # (gp itself grows here; the added fixed-node triples below are expected
    # to be removed again)
    gp += [
        (SOURCE_VAR, wikilink, dbp['City']),
        (TARGET_VAR, wikilink, dbp['Country']),
    ]
    gp_bloated = gp + [
        (dbp['City'], wikilink, dbp['Country']),
        (dbp['Country'], Variable('v2'), dbp['City']),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()

    # test unrestricting leaves:
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()
    gp_bloated = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
        (Variable('v4'), Variable('v7'), Variable('v8')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()

    # test leaves behind fixed nodes
    # (gp grows again: the wikilink triple to ?v4 is part of the expected
    # result from here on)
    gp += [
        (SOURCE_VAR, wikilink, Variable('v4')),
    ]
    gp_bloated = gp + [
        (Variable('v5'), wikilink, dbp['Country']),
        (Variable('v5'), Variable('v6'), Variable('v7')),
    ]
    res = mutate_simplify_pattern(gp_bloated)
    assert res == gp, 'not simplified:\n%s' % res.to_sparql_select_query()

    # counter example of an advanced but restricting pattern:
    # (variables are re-used across triples here, and the pattern is
    # expected to come back unchanged)
    gp = gp + [
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
        (Variable('v5'), Variable('v6'), Variable('v4')),
        (Variable('v4'), Variable('v7'), Variable('v8')),
        (TARGET_VAR, Variable('v3'), SOURCE_VAR),
        (dbp['City'], Variable('v6'), dbp['Country']),
        (dbp['Country'], Variable('v8'), dbp['City']),
    ]
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )

    # test atomic patterns:
    # (patterns consisting only of variable edges starting at source must
    # not be reduced)
    gp = GraphPattern([(SOURCE_VAR, Variable('v1'), Variable('v2'))])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )
    gp = GraphPattern([
        (SOURCE_VAR, Variable('v1'), Variable('v2')),
        (SOURCE_VAR, Variable('v3'), Variable('v4')),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s' % res.to_sparql_select_query(
    )

    # test edge var connections
    # (source and target are linked only through the shared variable ?p)
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (TARGET_VAR, Variable('p'), Variable('v2')),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s\nto\n%s' % (gp, res)
    # an extra triple hanging off ?v1 is expected to be removed
    gp2 = gp + [
        (Variable('v1'), Variable('v3'), Variable('v4')),
    ]
    res = mutate_simplify_pattern(gp2)
    assert res == gp, 'not simplified:\n%s\nto\n%s' % (gp2, res)
    # same check with ?p used as predicate in one triple and as subject in
    # the other
    gp = GraphPattern([
        (SOURCE_VAR, Variable('p'), Variable('v1')),
        (Variable('p'), Variable('v2'), TARGET_VAR),
    ])
    res = mutate_simplify_pattern(gp)
    assert res == gp, 'was simplified (bad):\n%s\nto\n%s' % (gp, res)
    # a parallel variable edge from ?p to target is expected to be removed
    gp2 = gp + [
        (Variable('p'), Variable('v3'), TARGET_VAR),
    ]
    res = mutate_simplify_pattern(gp2)
    assert res == gp, 'not simplified:\n%s\nto\n%s' % (gp2, res)
def test_mutate_increase_dist():
    """mutate_increase_dist must give a different pattern with diameter + 1."""
    direct_link = [(SOURCE_VAR, wikilink, TARGET_VAR)]
    original = GraphPattern(direct_link)
    mutated = mutate_increase_dist(original)
    assert original != mutated
    expected_diameter = original.diameter() + 1
    assert expected_diameter == mutated.diameter()
    # the input pattern still contains exactly SOURCE_VAR and TARGET_VAR
    assert original.vars_in_graph == {SOURCE_VAR, TARGET_VAR}
Example #19
0
def _dejsonify(pattern_str):
    """Build a GraphPattern from a JSON string holding a list of triples.

    Every item of every triple is wrapped in a ``Variable``.
    """
    triples = []
    for raw_triple in json.loads(pattern_str):
        triples.append(tuple(Variable(item) for item in raw_triple))
    return GraphPattern(triples)
Example #20
0
def pattern_generator(
        length,
        loops=True,
        node_edge_joint=True,
        p_only_connected=True,
        source_target_edges=True,
        exclude_isomorphic=True,
        count_candidates_only=False,
):
    """Enumerate graph patterns of `length` triples by brute force.

    Every `length`-sized combination of the possible triples is turned into
    a GraphPattern and passed through a series of filters. This is a
    generator.

    Args:
        length: Number of triples per candidate combination.
        loops: If False, patterns containing a triple whose subject equals
            its object are excluded.
        node_edge_joint: If True, nodes and edges draw from the same pool of
            variables (v0, v1, ...). If False, nodes use n0, n1, ... and
            edges e0, e1, ..., and patterns in which a variable is both node
            and edge are excluded.
        p_only_connected: Passed as `via_edges` to `gp.is_connected`.
        source_target_edges: If True, SOURCE_VAR and TARGET_VAR may also be
            used as edges. Requires `node_edge_joint`.
        exclude_isomorphic: If True, patterns whose canonicalized form was
            already generated are skipped, and the canonicalized form is
            yielded instead of the raw pattern.
        count_candidates_only: If True, only `(n_patterns, None)` is
            yielded and nothing is enumerated.

    Yields:
        `(pid, gp)` for each accepted pattern, where `pid` is the index of
        the combination, followed by a final `(n_patterns, None)`.
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    # maps canonicalized pattern -> (pid, first pattern seen with that form)
    canonicalized_patterns = {}

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target),
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [Variable('v%d' % i) for i in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        if source_target_edges:
            possible_edges = possible_nodes
        else:
            possible_edges = possible_vars
    else:
        possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % i) for i in range(length)]

    # full cross product of subjects, predicates and objects
    possible_triples = [
        (s, p, o)
        for s in possible_nodes
        for p in possible_edges
        for o in possible_nodes
    ]

    # presumably the binomial coefficient, i.e. the number of combinations
    # enumerated below (the assert after the loop relies on this)
    n_patterns = binom(len(possible_triples), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    i = 0  # number of patterns actually yielded
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})

        # check there are no skipped variables (nodes or edges)
        # i.e. the used variables must be a prefix of the possible ones
        # noinspection PyUnboundLocalVariable
        if (
                (node_edge_joint and vars_ != possible_vars[:len(vars_)]) or
                (not node_edge_joint and (
                    nodes != possible_var_nodes[:len(nodes)] or
                    edges != possible_edges[:len(edges)]
                ))
        ):
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any([s == o for s, p, o in gp]):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid,
                    canonicalized_patterns[cgp][0],
                    gp,
                    canonicalized_patterns[cgp][1]
                )
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                # hand out the canonical form rather than the raw pattern
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # sanity check: all combinations were enumerated
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        i, n_patterns, length
    )
    yield (n_patterns, None)
def test_mutate_increase_dist():
    """mutate_increase_dist must give a different pattern with diameter + 1."""
    direct_link = [(SOURCE_VAR, wikilink, TARGET_VAR)]
    original = GraphPattern(direct_link)
    mutated = mutate_increase_dist(original)
    assert original != mutated
    expected_diameter = original.diameter() + 1
    assert expected_diameter == mutated.diameter()
    # the input pattern still contains exactly SOURCE_VAR and TARGET_VAR
    assert original.vars_in_graph == {SOURCE_VAR, TARGET_VAR}
def pattern_generator(
    length,
    loops=True,
    node_edge_joint=True,
    p_only_connected=True,
    source_target_edges=True,
    exclude_isomorphic=True,
    count_candidates_only=False,
):
    """Enumerate graph patterns of `length` triples by brute force.

    Every `length`-sized combination of the possible triples is turned into
    a GraphPattern and passed through a series of filters. This is a
    generator.

    Args:
        length: Number of triples per candidate combination.
        loops: If False, patterns containing a triple whose subject equals
            its object are excluded.
        node_edge_joint: If True, nodes and edges draw from the same pool of
            variables (v0, v1, ...). If False, nodes use n0, n1, ... and
            edges e0, e1, ..., and patterns in which a variable is both node
            and edge are excluded.
        p_only_connected: Passed as `via_edges` to `gp.is_connected`.
        source_target_edges: If True, SOURCE_VAR and TARGET_VAR may also be
            used as edges. Requires `node_edge_joint`.
        exclude_isomorphic: If True, patterns whose canonicalized form was
            already generated are skipped, and the canonicalized form is
            yielded instead of the raw pattern.
        count_candidates_only: If True, only `(n_patterns, None)` is
            yielded and nothing is enumerated.

    Yields:
        `(pid, gp)` for each accepted pattern, where `pid` is the index of
        the combination, followed by a final `(n_patterns, None)`.
    """
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'
    # maps canonicalized pattern -> (pid, first pattern seen with that form)
    canonicalized_patterns = {}

    if node_edge_joint:
        # To be connected there are max 3 + 2 + 2 + 2 + ... vars for triples.
        # The first can be 3 different ones (including ?source and ?target),
        # then in each of the following triples at least one var has to be an
        # old one
        possible_vars = [Variable('v%d' % i) for i in range((2 * length) - 1)]
        possible_nodes = possible_vars + [SOURCE_VAR, TARGET_VAR]
        if source_target_edges:
            possible_edges = possible_nodes
        else:
            possible_edges = possible_vars
    else:
        possible_var_nodes = [Variable('n%d' % i) for i in range(length - 1)]
        possible_nodes = possible_var_nodes + [SOURCE_VAR, TARGET_VAR]
        possible_edges = [Variable('e%d' % i) for i in range(length)]

    # full cross product of subjects, predicates and objects
    possible_triples = [(s, p, o) for s in possible_nodes
                        for p in possible_edges for o in possible_nodes]

    # presumably the binomial coefficient, i.e. the number of combinations
    # enumerated below (the assert after the loop relies on this)
    n_patterns = binom(len(possible_triples), length)
    logger.info('generating %d possible patterns of length %d', n_patterns,
                length)
    if count_candidates_only:
        yield (n_patterns, None)
        return

    i = 0  # number of patterns actually yielded
    pid = 0
    for pid, pattern in enumerate(combinations(possible_triples, length)):
        gp = GraphPattern(pattern)

        # check that source and target are in gp:
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue
        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges - {SOURCE_VAR, TARGET_VAR})
        vars_ = sorted(gp.vars_in_graph - {SOURCE_VAR, TARGET_VAR})

        # check there are no skipped variables (nodes or edges)
        # i.e. the used variables must be a prefix of the possible ones
        # noinspection PyUnboundLocalVariable
        if ((node_edge_joint and vars_ != possible_vars[:len(vars_)])
                or (not node_edge_joint and
                    (nodes != possible_var_nodes[:len(nodes)]
                     or edges != possible_edges[:len(edges)]))):
            logger.debug('excluded %d: skipped var: %s', pid, gp)
            continue

        # check if nodes and edges are disjoint
        if not node_edge_joint and (gp.nodes & gp.edges):
            logger.debug('excluded %d: node-edge-joined: %s', pid, gp)
            continue

        # check for loops if necessary
        if not loops and any([s == o for s, p, o in gp]):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # check that the pattern is connected
        if not gp.is_connected(via_edges=p_only_connected):
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # exclude patterns which are isomorphic to already generated ones
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp in canonicalized_patterns:
                logger.debug('excluded %d: isomorphic to %d:\n%sand\n%s', pid,
                             canonicalized_patterns[cgp][0], gp,
                             canonicalized_patterns[cgp][1])
                continue
            else:
                canonicalized_patterns[cgp] = (pid, gp)
                # hand out the canonical form rather than the raw pattern
                gp = cgp
        i += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp
    # sanity check: all combinations were enumerated
    assert pid + 1 == n_patterns
    logger.info('found %d differing patterns out of %d possible of length %d',
                i, n_patterns, length)
    yield (n_patterns, None)
def patterns(
    length,
    loops=True,
    node_edge_joint=True,
    p_only_connected=True,
    source_target_edges=True,
    exclude_isomorphic=True,
    count_candidates_only=False,
):
    """Generate actual graph patterns from numerical patterns of `length`.

    For every numerical pattern from `numerical_patterns(length, ...)`, each
    ordered pair of distinct candidate numbers is mapped to SOURCE_VAR and
    TARGET_VAR; the remaining numbers become v0, v1, ... in sorted order.
    This is a generator.

    Args:
        length: Number of triples; forwarded to `numerical_patterns`.
        loops: Forwarded to `numerical_patterns`.
        node_edge_joint: Forwarded to `numerical_patterns`. Must be True if
            `source_target_edges` is True.
        p_only_connected: If False, numerical patterns whose nx graph
            (`to_nx_graph`) is not connected are excluded.
        source_target_edges: If False, numbers occurring at a predicate
            position are not used as source or target.
        exclude_isomorphic: If True, patterns whose canonicalized form was
            already generated are skipped, and the canonicalized form is
            yielded instead of the raw pattern.
        count_candidates_only: If True, no patterns are built; only the
            pattern id is advanced (with progress logging). Cannot be
            combined with `exclude_isomorphic`.

    Yields:
        `(pid, gp)` for each generated pattern, followed by a final
        `(pid + 1, None)`.
    """
    assert not count_candidates_only or not exclude_isomorphic, \
        'count_candidates_only cannot be used with isomorphism check'
    assert not source_target_edges or node_edge_joint, \
        'source_target_edges cannot be used without node_edge_joint'

    # maps canonicalized pattern -> (num_pat, s, t, first pattern seen)
    canonicalized_patterns = {}

    # starts at -1 so the first pattern gets id 0
    pid = -1
    for c, num_pat in enumerate(
            numerical_patterns(
                length,
                loops=loops,
                node_edge_joint=node_edge_joint,
            )):
        assert (len(num_pat)) == length, 'too short: %s' % (num_pat, )
        flat_num_pat = [v for t in num_pat for v in t]
        all_numbers = set(flat_num_pat)

        if not p_only_connected:
            # Numerical patterns are always connected, but they might be
            # p_only_connected (e.g., 123 425).
            # Check that the pattern isn't p_only_connected, meaning that it's
            # also connected by nodes (e.g., 123 325).
            # Note that in case of node_edge_joint 123 245 is also considered
            # p_only_connected.
            if not nx.is_connected(to_nx_graph(num_pat)):
                logger.debug('excluded %d: not node connected:\n%s', c,
                             num_pat)
                continue

        # `numbers` are the candidates for source and target
        if source_target_edges:
            all_numbers = sorted(all_numbers)
            numbers = all_numbers
        else:
            # drop everything that occurs at a predicate position
            numbers = sorted(all_numbers - set(flat_num_pat[1::3]))
            all_numbers = sorted(all_numbers)

        if count_candidates_only:
            l = len(numbers)
            # number of ordered (s, t) pairs, as in permutations(numbers, 2)
            perms = l * (l - 1)
            pid += perms
            # yield pid, None  # way slower, rather show progress from here:
            if c % 100000 == 0:
                logger.info('pattern id: %d, vars: %d, permutations: %d', pid,
                            l, perms)
            continue

        for s, t in permutations(numbers, 2):
            pid += 1
            # source and target are mapped to numbers s and t
            # re-enumerate the leftover numbers to close "holes"
            leftover_numbers = [n for n in all_numbers if n != s and n != t]
            var_map = {
                n: Variable('v%d' % i)
                for i, n in enumerate(leftover_numbers)
            }
            var_map[s] = SOURCE_VAR
            var_map[t] = TARGET_VAR
            gp = GraphPattern(
                tuple([tuple([var_map[i] for i in trip]) for trip in num_pat]))
            assert len(gp) == length, \
                'gp too short: num %s\n%s' % (num_pat, gp)

            # exclude patterns which are isomorphic to already generated ones
            if exclude_isomorphic:
                cgp = canonicalize(gp)
                if cgp in canonicalized_patterns:
                    igp = canonicalized_patterns[cgp]
                    igp_numpat, igp_s, igp_t, igp_gp = igp
                    logger.debug(
                        'excluded isomorphic %s with ?s=%d, ?t=%d:\n'
                        'isomorphic to %s with ?s=%d, ?t=%d:\n'
                        '%sand\n%s',
                        num_pat,
                        s,
                        t,
                        igp_numpat,
                        igp_s,
                        igp_t,
                        gp,
                        igp_gp,
                    )
                    continue
                else:
                    canonicalized_patterns[cgp] = (num_pat, s, t, gp)
                    # hand out the canonical form rather than the raw pattern
                    gp = cgp
            yield pid, gp
    # final item: total number of pattern ids, without a pattern
    yield pid + 1, None
def test_graph_pattern():
    """Check the basic ``GraphPattern`` operations on a small rdflib graph.

    A three-triple graph is turned into patterns whose SPARQL SELECT
    serialisation is compared against literal queries after ``replace``,
    ``bind``, source/target construction, ``only_with``, ``+`` and ``-``.
    Finally the ``triples_by_identifier``, ``triples_by_nodes`` and
    ``triples_by_edges`` lookups of the combined pattern are verified.
    """

    def check_query(actual, wanted):
        # one place for the comparison so every failure shows both queries
        assert actual == wanted, "expected: %s\ngot: %s" % (wanted, actual)

    # terms shared by the graph and the expectations below
    foo, faa = URIRef('foo'), URIRef('faa')
    bar, baa, boo = URIRef('bar'), URIRef('baa'), URIRef('boo')
    bla, blub = Literal('bla'), Literal('blub')
    var_a, var_l, var_target = Variable('a'), Variable('l'), Variable('target')

    g = Graph()
    for triple in ((foo, bar, bla), (foo, baa, bla), (faa, boo, blub)):
        g.add(triple)

    gp = GraphPattern(g).replace({foo: var_a, bla: var_l})
    query_gp = (
        'SELECT ?a ?l WHERE {\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    check_query(gp.to_sparql_select_query(), query_gp)
    # a second serialisation has to produce the very same query
    check_query(gp.to_sparql_select_query(), query_gp)

    gp2 = gp.replace({baa: Variable('b')})
    query_gp2_bound = (
        'SELECT ?a ?b ?l WHERE {\n'
        ' ?a ?b ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        ' FILTER(\n'
        '  ?a=<bound>\n'
        ' )\n'
        '}\n'
    )
    check_query(
        gp2.to_sparql_select_query(bind={var_a: URIRef('bound')}),
        query_gp2_bound,
    )

    gp3 = GraphPattern(g, source_node=foo, target_node=bla)
    query_gp3 = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <baa> ?target .\n'
        ' ?source <bar> ?target .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    check_query(gp3.to_sparql_select_query(), query_gp3)

    gp4 = gp3.only_with([TARGET_VAR])
    query_gp4 = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <baa> ?target .\n'
        ' ?source <bar> ?target .\n'
        '}\n'
    )
    check_query(gp4.to_sparql_select_query(), query_gp4)
    # mapping <baa> onto <bar> makes both triples equal
    gp4_red = gp4.replace({baa: bar})
    assert len(gp4) > len(gp4_red), \
        "double edge should've been reduced: %s" % (gp4_red,)

    gp5 = gp3.only_with([bar])
    query_gp5 = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <bar> ?target .\n'
        '}\n'
    )
    check_query(gp5.to_sparql_select_query(), query_gp5)

    gp6 = gp + gp2
    query_gp6 = (
        'SELECT ?a ?b ?l WHERE {\n'
        ' ?a ?b ?l .\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    check_query(gp6.to_sparql_select_query(), query_gp6)

    gp7 = gp - gp2
    query_gp7 = (
        'SELECT ?a ?l WHERE {\n'
        ' ?a <baa> ?l .\n'
        '}\n'
    )
    check_query(gp7.to_sparql_select_query(), query_gp7)

    gp8 = gp + ((TARGET_VAR, TARGET_VAR, TARGET_VAR), )
    query_gp8 = (
        'SELECT ?a ?l ?target WHERE {\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' ?target ?target ?target .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    check_query(gp8.to_sparql_select_query(), query_gp8)

    # subtracting a pattern from itself (or from its triple list) empties it
    gp9 = gp - gp
    assert not gp9, 'gp9 was not empty'
    gp9 = gp - list(gp)
    assert not gp9, 'gp9 - list(gp9) was not empty'

    # the four distinct triples contained in gp8
    t_baa = (var_a, baa, var_l)
    t_bar = (var_a, bar, var_l)
    t_target = (var_target, var_target, var_target)
    t_blub = (faa, boo, blub)

    # test triples by identifier:
    tbi = gp8.triples_by_identifier()
    expected = {
        var_a: {t_baa, t_bar},
        var_l: {t_baa, t_bar},
        baa: {t_baa},
        bar: {t_bar},
        var_target: {t_target},
        faa: {t_blub},
        boo: {t_blub},
        blub: {t_blub},
    }
    assert tbi == expected, 'triples_by_identifier %s != %s' % (tbi, expected)

    # the expectation maps <notthere> and the pure edge <boo> to empty sets
    tbn = gp8.triples_by_nodes(
        {var_a, var_target, URIRef('notthere'), faa, boo})
    expected = {
        var_a: {t_baa, t_bar},
        var_target: {t_target},
        faa: {t_blub},
        URIRef('notthere'): set(),
        boo: set(),
    }
    assert tbn == expected, 'triples_by_nodes %s != %s' % (tbn, expected)

    # queried as '?target', expected back under the key Variable('target')
    tbe = gp8.triples_by_edges({baa, var_a, Variable('?target')})
    expected = {
        baa: {t_baa},
        var_target: {t_target},
        var_a: set(),
    }
    assert tbe == expected, 'triples_by_edges %s != %s' % (tbe, expected)
def test_graph_pattern_stats():
    """Verify the identifier counters kept by ``GraphPatternStats``.

    Two patterns are added one after the other; after each addition the
    ``identifier_gt_node_count`` and ``identifier_gt_pair_count`` mappings
    are compared against hand-written expectations, and finally the
    ``min_identifier_gt_*_occurrences`` values for the first pattern are
    checked.
    """
    bar, foo, single = URIRef('bar'), URIRef('foo'), URIRef('single')
    pred1, pred2, pred3 = URIRef('pred1'), URIRef('pred2'), URIRef('pred3')
    s, s2, t = URIRef('s'), URIRef('s2'), URIRef('t')

    gp = GraphPattern(
        (
            (bar, pred1, s),
            (foo, pred2, t),
            (s, pred3, t),
        ),
        source_node=s,
        target_node=t,
    )
    # same shape as gp but with a different source and one extra triple
    gp1 = GraphPattern(
        (
            (bar, pred1, s2),
            (foo, pred2, t),
            (s2, pred3, t),
            (single, pred1, s2),
        ),
        source_node=s2,
        target_node=t,
    )

    gps = GraphPatternStats()
    gps.add_graph_pattern(gp, s, t)
    identifiers = set(gp.identifier_counts(True))
    assert identifiers == {bar, foo, pred1, pred2, pred3}, identifiers
    assert identifiers == set(gps.identifier_gt_node_count.keys())
    assert identifiers == set(gps.identifier_gt_pair_count.keys())
    # after a single pattern every identifier has to be counted exactly once
    counted_once = {(ident, 1) for ident in identifiers}
    assert counted_once == set(gps.identifier_gt_node_count.items())
    assert counted_once == set(gps.identifier_gt_pair_count.items())

    gps.add_graph_pattern(gp1, s2, t)
    with_single = identifiers | {single}
    assert set(gps.identifier_gt_node_count.keys()) == with_single
    assert set(gps.identifier_gt_pair_count.keys()) == with_single

    expected_node = {
        (bar, 2),
        (foo, 1),
        (pred1, 2),
        (pred2, 1),
        (pred3, 2),
        (single, 1),
    }
    res = set(gps.identifier_gt_node_count.items())
    assert expected_node == res, res

    expected_pair = {
        (bar, 2),
        (foo, 2),
        (pred1, 2),
        (pred2, 2),
        (pred3, 2),
        (single, 1),
    }
    res = set(gps.identifier_gt_pair_count.items())
    assert expected_pair == res, res

    min_nodes = gps.min_identifier_gt_node_occurrences(gp)
    assert min_nodes == 1, 'tmp: %d\n%s' % (min_nodes, gps)
    min_pairs = gps.min_identifier_gt_pair_occurrences(gp)
    assert min_pairs == 2, 'tmp: %d\n%s' % (min_pairs, gps)
Example #26
0
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Enumerate candidate graph patterns made of ``length`` triples.

    Every combination of ``length`` triples over the variable nodes
    ``?n0..``, the source/target variables and the variable edges ``?e0..``
    is tried; combinations failing one of the checks below are skipped.

    :param length: number of triples drawn per combination.
    :param loops: if false, combinations containing a triple whose subject
        equals its object are skipped.
    :param exclude_isomorphic: if true, combinations whose canonical form
        was already produced are skipped, and the canonical form is yielded
        instead of the raw pattern.
    :return: generator of ``(pid, pattern)`` tuples, ``pid`` being the index
        of the combination; a closing ``(n_patterns, None)`` tuple is yielded
        once all combinations were visited.
    """
    var_nodes = [Variable('n%d' % k) for k in range(length - 1)]
    node_pool = var_nodes + [SOURCE_VAR, TARGET_VAR]
    edge_pool = [Variable('e%d' % k) for k in range(length)]

    # all (subject, predicate, object) triples that can be drawn from
    triple_pool = []
    for subj in node_pool:
        for pred in edge_pool:
            for obj in node_pool:
                triple_pool.append((subj, pred, obj))

    n_patterns = binom(len(triple_pool), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length)

    # canonical pattern -> (pid, pattern) of its first occurrence
    first_seen = {}
    n_generated = 0
    pid = 0
    for pid, triples in enumerate(combinations(triple_pool, length)):
        gp = GraphPattern(triples)

        # source and target both have to occur in the pattern
        if not gp.complete():
            logger.debug(
                'excluded %d: source or target missing: %s', pid, gp)
            continue

        used_nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        used_edges = sorted(gp.edges)

        # variables must be used without gaps, e.g., no n2 without n1
        if used_nodes != var_nodes[:len(used_nodes)]:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        if used_edges != edge_pool[:len(used_edges)]:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # drop self-loops unless they are allowed
        if not loops and any(s == o for s, _, o in gp):
            logger.debug('excluded %d: loop: %s', pid, gp)
            continue

        # only connected patterns are of interest
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # skip patterns isomorphic to one that was generated before
        if exclude_isomorphic:
            canonical = canonicalize(gp)
            if canonical in first_seen:
                earlier_pid, earlier_gp = first_seen[canonical]
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid, earlier_pid, gp, earlier_gp,
                )
                continue
            first_seen[canonical] = (pid, gp)
            gp = canonical

        n_generated += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    # sanity check: the loop must have visited every combination
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        n_generated, n_patterns, length
    )
    yield (n_patterns, None)
def test_mutate_merge_var():
    """Check which patterns ``mutate_merge_var`` can produce.

    The mutation is called with a second argument of ``0`` and of ``1`` on
    one- and two-triple patterns. Where several outcomes are possible, the
    call is repeated up to 100 times and every expected outcome has to show
    up at least once.
    """
    p, q = Variable('p'), Variable('q')
    terminals = {SOURCE_VAR, TARGET_VAR}

    gp = GraphPattern([(SOURCE_VAR, p, TARGET_VAR)])
    assert mutate_merge_var(gp, 0) == gp
    res = mutate_merge_var(gp, 1)
    assert res[0][1] in terminals

    gp2 = gp + [(SOURCE_VAR, q, TARGET_VAR)]
    res = mutate_merge_var(gp2, 0)
    assert len(res) == 1, "?q must have become ?p or vice versa: %s" % len(res)
    assert res[0][1] in {p, q}

    collapsed, kept_two = False, False
    for _ in range(100):
        res = mutate_merge_var(gp2, 1)
        if len(res) != 1:
            # one of the edge vars must have become ?s or ?t
            edge_vars = {res[0][1], res[1][1]}
            assert edge_vars & terminals
            assert edge_vars - terminals
            kept_two = True
        else:
            assert res[0][1] in {p, q}
            collapsed = True
        if collapsed and kept_two:
            break
    else:
        assert False, "merge never reached one of two cases: %s %s" % (
            collapsed, kept_two)

    gp2 = gp + [(q, p, TARGET_VAR)]
    q_as_target = gp + [(TARGET_VAR, p, TARGET_VAR)]
    q_as_p = gp + [(p, p, TARGET_VAR)]

    collapsed, kept_two = False, False
    for _ in range(100):
        res = mutate_merge_var(gp2, 0)
        if len(res) != 1:
            # q became ?target
            assert res == q_as_target
            kept_two = True
        else:
            # q must have become ?source
            assert res == gp
            collapsed = True
        if collapsed and kept_two:
            break
    else:
        assert False, "merge never reached one of two cases: %s %s" % (
            collapsed, kept_two)

    cases = [False] * 4
    for _ in range(100):
        res = mutate_merge_var(gp2, 1)
        if len(res) == 1:
            # q must have become ?source
            assert res == gp
            cases[0] = True
        elif res == q_as_target:
            # ?q became ?target
            cases[1] = True
        elif res == q_as_p:
            # ?q became ?p
            cases[2] = True
        else:
            # ?p became one of {?q, ?source, ?target}
            assert res[0][1] in {q, SOURCE_VAR, TARGET_VAR}
            cases[3] = True
        if all(cases):
            break
    else:
        assert False, "merge never reached one of the cases: %s" % cases
def test_graph_pattern_stats():
    """Test the per-identifier counters of ``GraphPatternStats``.

    A first pattern is added and all identifiers returned by
    ``identifier_counts(True)`` have to be counted once in both the node
    and the pair counter. A second, overlapping pattern is then added and
    both counters are compared with explicit expectations, followed by the
    minimum occurrence lookups for the first pattern.
    """
    uri = URIRef  # short alias, the test builds a lot of URIs

    triples_gp = (
        (uri('bar'), uri('pred1'), uri('s')),
        (uri('foo'), uri('pred2'), uri('t')),
        (uri('s'), uri('pred3'), uri('t')),
    )
    gp = GraphPattern(triples_gp, source_node=uri('s'), target_node=uri('t'))

    triples_gp1 = (
        (uri('bar'), uri('pred1'), uri('s2')),
        (uri('foo'), uri('pred2'), uri('t')),
        (uri('s2'), uri('pred3'), uri('t')),
        (uri('single'), uri('pred1'), uri('s2')),
    )
    gp1 = GraphPattern(
        triples_gp1, source_node=uri('s2'), target_node=uri('t'))

    gps = GraphPatternStats()
    gps.add_graph_pattern(gp, uri('s'), uri('t'))

    identifiers = set(gp.identifier_counts(True))
    names_first = ('bar', 'foo', 'pred1', 'pred2', 'pred3')
    assert identifiers == {uri(name) for name in names_first}, identifiers

    node_count = gps.identifier_gt_node_count
    pair_count = gps.identifier_gt_pair_count
    assert identifiers == set(node_count.keys())
    assert identifiers == set(pair_count.keys())
    # with only one pattern added, each identifier carries a count of 1
    all_ones = {(identifier, 1) for identifier in identifiers}
    assert all_ones == set(node_count.items())
    assert all_ones == set(pair_count.items())

    gps.add_graph_pattern(gp1, uri('s2'), uri('t'))
    assert set(gps.identifier_gt_node_count.keys()) == \
        identifiers | {uri('single')}
    assert set(gps.identifier_gt_pair_count.keys()) == \
        identifiers | {uri('single')}

    expected_node = {
        (uri('bar'), 2),
        (uri('foo'), 1),
        (uri('pred1'), 2),
        (uri('pred2'), 1),
        (uri('pred3'), 2),
        (uri('single'), 1),
    }
    res = set(gps.identifier_gt_node_count.items())
    assert expected_node == res, res

    expected_pair = {
        (uri('bar'), 2),
        (uri('foo'), 2),
        (uri('pred1'), 2),
        (uri('pred2'), 2),
        (uri('pred3'), 2),
        (uri('single'), 1),
    }
    res = set(gps.identifier_gt_pair_count.items())
    assert expected_pair == res, res

    tmp = gps.min_identifier_gt_node_occurrences(gp)
    assert tmp == 1, 'tmp: %d\n%s' % (tmp, gps)
    tmp = gps.min_identifier_gt_pair_occurrences(gp)
    assert tmp == 2, 'tmp: %d\n%s' % (tmp, gps)
Example #29
0
def pattern_generator(length, loops=True, exclude_isomorphic=True):
    """Generate graph patterns built from ``length`` variable triples.

    The triples are drawn from all combinations of the nodes
    (``?n0`` .. plus source and target variable) and edges (``?e0`` ..).
    A combination is only yielded if it contains source and target, uses
    node and edge variables without gaps, is connected and (optionally) has
    no loops and is not isomorphic to an earlier result.

    :param length: amount of triples in each combination.
    :param loops: whether triples with identical subject and object are
        allowed.
    :param exclude_isomorphic: whether to canonicalize each pattern and
        drop those whose canonical form was seen before; the canonical
        form is what gets yielded in that case.
    :return: generator over ``(pid, pattern)``, with ``pid`` the running
        number of the combination, terminated by ``(n_patterns, None)``.
    """
    canonical_seen = {}
    free_nodes = [Variable('n%d' % num) for num in range(length - 1)]
    nodes_available = free_nodes + [SOURCE_VAR, TARGET_VAR]
    edges_available = [Variable('e%d' % num) for num in range(length)]

    triples_available = [
        (subj, pred, obj)
        for subj in nodes_available
        for pred in edges_available
        for obj in nodes_available
    ]

    n_patterns = binom(len(triples_available), length)
    logger.info(
        'generating %d possible patterns of length %d', n_patterns, length
    )

    accepted = 0
    pid = 0
    candidates = combinations(triples_available, length)
    for pid, candidate in enumerate(candidates):
        gp = GraphPattern(candidate)

        # incomplete patterns lack the source or the target variable
        if not gp.complete():
            logger.debug('excluded %d: source or target missing: %s', pid, gp)
            continue

        nodes = sorted(gp.nodes - {SOURCE_VAR, TARGET_VAR})
        edges = sorted(gp.edges)

        # node / edge variables have to be used in order without skipping
        nodes_in_order = nodes == free_nodes[:len(nodes)]
        if not nodes_in_order:
            logger.debug('excluded %d: skipped node: %s', pid, gp)
            continue
        edges_in_order = edges == edges_available[:len(edges)]
        if not edges_in_order:
            logger.debug('excluded %d: skipped edge: %s', pid, gp)
            continue

        # loops are only checked for when they are disallowed
        if not loops:
            if any(s == o for s, _, o in gp):
                logger.debug('excluded %d: loop: %s', pid, gp)
                continue

        # disconnected patterns are dropped
        if not gp.is_connected():
            logger.debug('excluded %d: not connected:\n%s', pid, gp)
            continue

        # isomorphic duplicates are recognised via their canonical form
        if exclude_isomorphic:
            cgp = canonicalize(gp)
            if cgp not in canonical_seen:
                canonical_seen[cgp] = (pid, gp)
                gp = cgp
            else:
                logger.debug(
                    'excluded %d: isomorphic to %d:\n%sand\n%s',
                    pid,
                    canonical_seen[cgp][0],
                    gp,
                    canonical_seen[cgp][1],
                )
                continue

        accepted += 1
        logger.debug('generated pattern %d: %s', pid, gp)
        yield pid, gp

    # all combinations have to be consumed at this point
    assert pid + 1 == n_patterns
    logger.info(
        'found %d differing patterns out of %d possible of length %d',
        accepted, n_patterns, length
    )
    yield (n_patterns, None)
def test_graph_pattern():
    """Test ``GraphPattern`` construction, arithmetic and serialisation.

    Starting from an rdflib ``Graph`` with three triples, the test derives
    several patterns (``replace``, source/target nodes, ``only_with``,
    ``+``, ``-``) and compares their ``to_sparql_select_query`` output with
    literal queries. The last part checks the triple lookups by identifier,
    by node and by edge.
    """
    # message template shared by all query comparisons
    query_msg = "expected: %s\ngot: %s"

    graph_triples = [
        (URIRef('foo'), URIRef('bar'), Literal('bla')),
        (URIRef('foo'), URIRef('baa'), Literal('bla')),
        (URIRef('faa'), URIRef('boo'), Literal('blub')),
    ]
    g = Graph()
    for graph_triple in graph_triples:
        g.add(graph_triple)

    replacements = {
        URIRef('foo'): Variable('a'),
        Literal('bla'): Variable('l'),
    }
    gp = GraphPattern(g).replace(replacements)
    expected = (
        'SELECT ?a ?l WHERE {\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    # the query is requested twice and has to be the same both times
    first_query = gp.to_sparql_select_query()
    assert first_query == expected, query_msg % (expected, first_query)
    second_query = gp.to_sparql_select_query()
    assert second_query == expected, query_msg % (expected, second_query)

    gp2 = gp.replace({URIRef('baa'): Variable('b')})
    binding = {Variable('a'): URIRef('bound')}
    sparql = gp2.to_sparql_select_query(bind=binding)
    expected = (
        'SELECT ?a ?b ?l WHERE {\n'
        ' ?a ?b ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        ' FILTER(\n'
        '  ?a=<bound>\n'
        ' )\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    gp3 = GraphPattern(
        g,
        source_node=URIRef('foo'),
        target_node=Literal('bla'),
    )
    sparql = gp3.to_sparql_select_query()
    expected = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <baa> ?target .\n'
        ' ?source <bar> ?target .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    gp4 = gp3.only_with([TARGET_VAR])
    sparql = gp4.to_sparql_select_query()
    expected = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <baa> ?target .\n'
        ' ?source <bar> ?target .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)
    # replacing <baa> by <bar> turns the two triples into the same one
    gp4_red = gp4.replace({URIRef('baa'): URIRef('bar')})
    assert len(gp4_red) < len(gp4), \
        "double edge should've been reduced: %s" % (gp4_red,)

    gp5 = gp3.only_with([URIRef('bar')])
    sparql = gp5.to_sparql_select_query()
    expected = (
        'SELECT ?source ?target WHERE {\n'
        ' ?source <bar> ?target .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    gp6 = gp + gp2
    sparql = gp6.to_sparql_select_query()
    expected = (
        'SELECT ?a ?b ?l WHERE {\n'
        ' ?a ?b ?l .\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    gp7 = gp - gp2
    sparql = gp7.to_sparql_select_query()
    expected = (
        'SELECT ?a ?l WHERE {\n'
        ' ?a <baa> ?l .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    target_loop = (TARGET_VAR, TARGET_VAR, TARGET_VAR)
    gp8 = gp + (target_loop,)
    sparql = gp8.to_sparql_select_query()
    expected = (
        'SELECT ?a ?l ?target WHERE {\n'
        ' ?a <baa> ?l .\n'
        ' ?a <bar> ?l .\n'
        ' ?target ?target ?target .\n'
        ' <faa> <boo> "blub" .\n'
        '}\n'
    )
    assert sparql == expected, query_msg % (expected, sparql)

    # removing all own triples has to leave an empty (falsy) pattern
    gp9 = gp - gp
    assert not gp9, 'gp9 was not empty'
    gp9 = gp - list(gp)
    assert not gp9, 'gp9 - list(gp9) was not empty'

    # triples of gp8, named for reuse in the expectations below
    a_baa_l = (Variable('a'), URIRef('baa'), Variable('l'))
    a_bar_l = (Variable('a'), URIRef('bar'), Variable('l'))
    all_target = (Variable('target'), Variable('target'), Variable('target'))
    faa_boo_blub = (URIRef('faa'), URIRef('boo'), Literal('blub'))

    # test triples by identifier:
    tbi = gp8.triples_by_identifier()
    expected = {
        Variable('a'): {a_baa_l, a_bar_l},
        Variable('l'): {a_baa_l, a_bar_l},
        URIRef('baa'): {a_baa_l},
        URIRef('bar'): {a_bar_l},
        Variable('target'): {all_target},
        URIRef('faa'): {faa_boo_blub},
        URIRef('boo'): {faa_boo_blub},
        Literal('blub'): {faa_boo_blub},
    }
    assert tbi == expected, 'triples_by_identifier %s != %s' % (tbi, expected)

    # <notthere> and <boo> are expected with empty sets in the node lookup
    requested_nodes = {
        Variable('a'),
        Variable('target'),
        URIRef('notthere'),
        URIRef('faa'),
        URIRef('boo'),
    }
    tbn = gp8.triples_by_nodes(requested_nodes)
    expected = {
        Variable('a'): {a_baa_l, a_bar_l},
        Variable('target'): {all_target},
        URIRef('faa'): {faa_boo_blub},
        URIRef('notthere'): set(),
        URIRef('boo'): set(),
    }
    assert tbn == expected, 'triples_by_nodes %s != %s' % (tbn, expected)

    # the edge lookup is queried with '?target' and expected as 'target'
    requested_edges = {URIRef('baa'), Variable('a'), Variable('?target')}
    tbe = gp8.triples_by_edges(requested_edges)
    expected = {
        URIRef('baa'): {a_baa_l},
        Variable('target'): {all_target},
        Variable('a'): set(),
    }
    assert tbe == expected, 'triples_by_edges %s != %s' % (tbe, expected)