Example #1
0
async def test_filter_iterator_interrupt():
    expression = "?p = <http://schema.org/eligibleRegion>"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, 10e-7, 2)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    reloaded = load(saved.SerializeToString(),
                    DummyDataset(hdtDoc, 'watdiv100'))
    (results, saved, done, _) = await engine.execute(reloaded, 10e7)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
Example #2
0
def load_projection(saved_plan, dataset):
    """Load a ProjectionIterator from a protobuf serialization"""
    default_graph = saved_plan.graph
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    values = saved_plan.values if len(saved_plan.values) > 0 else None
    return ProjectionIterator(source, dataset, default_graph, values)
Example #3
0
async def test_filter_iterator_interrupt():
    context = { 'quantum': 10e-7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion>"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) <= 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    tmp = len(results)
    context['quantum'] = 10e7
    reloaded = load(saved.SerializeToString(), DummyDataset(hdtDoc, 'watdiv100'), context)
    (results, saved, done, _) = await engine.execute(reloaded, context)
    assert len(results) + tmp == 4
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country1',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country4',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
    assert done
Example #4
0
async def test_operation_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "10 = 5 * 2"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 9
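The interrupt tests above all follow the same preemption protocol: run the plan for a small time quantum, serialize the saved plan, reload it, and resume until the engine reports completion. A minimal driver sketch of that loop under the context-based API used just above; `run_to_completion` is a hypothetical helper, not part of the test suite, and `engine`, `load`, `DummyDataset` and `hdtDoc` are the same fixtures the tests rely on:

async def run_to_completion(plan, context):
    # Hypothetical helper: run a preemptable plan to completion, reloading it
    # from its protobuf snapshot after every partial execution.
    gathered = []
    done = False
    while not done:
        (results, saved, done, _) = await engine.execute(plan, context)
        gathered.extend(results)
        if not done:
            # resume from the serialized snapshot, exactly as the tests do
            plan = load(saved.SerializeToString(), DummyDataset(hdtDoc, 'watdiv100'), context)
    return gathered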
Example #5
0
async def test_function_filter_iterator():
    expression = '?p = <http://purl.org/goodrelations/price> && isLiteral(?o) && !isNumeric(?o)'
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 1
Example #6
0
async def test_operation_filter_iterator():
    expression = "10 = 5 * 2"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 9
Example #7
0
async def test_projection_read_stopped():
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ScanIterator(iterator, triple, card)
    proj = ProjectionIterator(scan, ['?common'])
    (results, saved, done, _) = await engine.execute(proj, 10e-4)
    assert len(results) <= card
    for res in results:
        assert '?common' in res and '?s1' not in res
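The projection semantics asserted above (`?common` kept, `?s1` dropped) amount to restricting each solution mapping to the projected variables. A standalone sketch, independent of the iterator classes and assuming mappings are plain dicts as in the assertions; the helper name `project` is illustrative only:

def project(mapping, variables):
    # Keep only the projected variables of one solution mapping.
    return {var: value for var, value in mapping.items() if var in variables}

project({'?common': 'http://example.org#a', '?s1': 'http://example.org#b'}, ['?common'])
# -> {'?common': 'http://example.org#a'}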
Example #8
0
async def test_rowbind_join_proj():
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'], triple['object'])
    scan = ScanIterator(iterator, triple, card)
    bind = BindIterator(scan, "URI(CONCAT('http://',MD5(CONCAT(STR(?s),STR('http://isa'),STR(?o)))))", '?z')
    join = IndexJoinIterator(bind, innerTriple, hdtDoc)
    proj = ProjectionIterator(join, ['?z'])

    (results, saved, done, _) = await engine.execute(proj, 10e7)
    assert len(results) > 0
    assert done
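The BIND expression in this test skolemizes each row into a fresh IRI. Its effect can be reproduced in plain Python, which helps when checking expected `?z` values by hand; the helper name `skolemize` is hypothetical:

import hashlib

def skolemize(s, o):
    # Mirrors URI(CONCAT('http://',MD5(CONCAT(STR(?s),STR('http://isa'),STR(?o)))))
    digest = hashlib.md5((s + 'http://isa' + o).encode('utf-8')).hexdigest()
    return 'http://' + digest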
Example #9
0
async def test_and_or_filter_iterator():
    context = { 'quantum': 10e7, 'max_results': 10e7 }
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    scan = ProjectionIterator(ScanIterator(hdtDoc, triple, context), context)
    iterator = FilterIterator(scan, expression, context)
    (results, saved, done, _) = await engine.execute(iterator, context)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
Example #10
0
async def test_and_or_filter_iterator():
    expression = "?p = <http://schema.org/eligibleRegion> && (?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country0> || ?o = <http://db.uwaterloo.ca/~galuc/wsdbm/Country9>)"
    iterator, card = hdtDoc.search(triple['subject'], triple['predicate'],
                                   triple['object'])
    scan = ProjectionIterator(ScanIterator(iterator, triple, card))
    iterator = FilterIterator(scan, expression)
    (results, saved, done, _) = await engine.execute(iterator, math.inf)
    assert len(results) == 2
    for b in results:
        assert b['?p'] == 'http://schema.org/eligibleRegion'
        assert b['?o'] in [
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country0',
            'http://db.uwaterloo.ca/~galuc/wsdbm/Country9'
        ]
Example #11
0
def load_projection(saved_plan: SavedProjectionIterator, dataset: Dataset) -> PreemptableIterator:
    """Load a ProjectionIterator from a protobuf serialization.

    Args:
      * saved_plan: Saved query execution plan.
      * dataset: RDF dataset used to execute the plan.

    Returns:
      The pipeline of iterators used to resume query execution.
    """
    sourceField = saved_plan.WhichOneof('source')
    source = load(getattr(saved_plan, sourceField), dataset)
    values = saved_plan.values if len(saved_plan.values) > 0 else None
    return ProjectionIterator(source, values)
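`WhichOneof('source')` is standard protobuf API: it returns the name of whichever field of the `source` oneof is currently set in the message, or `None` if none is, which is what lets `load` dispatch to the concrete child iterator. A hypothetical message layout consistent with this loader, given as comments since the project's actual schema is not shown here:

# Hypothetical .proto fragment (field names are illustrative only):
#
#   message SavedProjectionIterator {
#     repeated string values = 1;
#     oneof source {
#       SavedScanIterator scan_source = 2;
#       SavedFilterIterator filter_source = 3;
#     }
#   }
field_name = saved_plan.WhichOneof('source')  # e.g. 'scan_source'
child = getattr(saved_plan, field_name)       # nested saved plan to reload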
Example #12
0
def parse_query_node(node,
                     dataset,
                     current_graphs,
                     server_url,
                     cardinalities,
                     renaming_map=None):
    """
        Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

        Args:
            * node - Node of the logical plan to parse (in rdflib format)
            * dataset - RDF dataset used to execute the query
            * current_graphs - List of IRI of the current RDF graph queried
            * server_url - URL of the SaGe server
            * cardinalities - Map<triple,integer> used to track triple patterns cardinalities
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_graph_uri(format_term(graph_iri.default), server_url)
                for graph_iri in node.datasetClause
            ]
        return parse_query_node(node.p, dataset, graphs, server_url,
                                cardinalities)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: t.n3(), node.PV))
        if node.p.name == 'AggregateJoin' or node.p.name == 'Extend':
            # forward projection variables, as we need them for parsing an AggregateJoin
            node.p['PV'] = query_vars
            return parse_query_node(node.p, dataset, current_graphs,
                                    server_url, cardinalities)
        child = parse_query_node(node.p, dataset, current_graphs, server_url,
                                 cardinalities)
        return ProjectionIterator(child, dataset, current_graphs[0],
                                  query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triple(node.triples, current_graphs))
        # format triple patterns for the backend API
        patterns = []
        for triple in triples:
            graph_uri = triple['graph'] if 'graph' in triple else current_graphs[0]
            graph = dataset.get_graph(graph_uri)
            patterns.append({
                'subject': triple['subject'] if triple['subject'].startswith('?') else graph.get_identifiant(triple['subject']),
                'predicate': triple['predicate'] if triple['predicate'].startswith('?') else graph.get_identifiant(triple['predicate']),
                'object': triple['object'] if triple['object'].startswith('?') else graph.get_identifiant(triple['object']),
                'graph': graph_uri
            })
        iterator, query_vars, c = build_left_plan(patterns, dataset,
                                                  current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1, dataset, current_graphs, server_url,
                                cardinalities)
        right = parse_query_node(node.p2, dataset, current_graphs, server_url,
                                 cardinalities)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p, dataset, current_graphs,
                                    server_url, cardinalities)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow joins between BGPs from different GRAPH clauses
        triples = fetch_graph_triples(node.p1, current_graphs, server_url)
        triples += fetch_graph_triples(node.p2, current_graphs, server_url)
        iterator, query_vars, c = build_left_plan(triples, dataset,
                                                  current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Extend':
        # collapse chains of Extend operators into a renaming map, as they only rename expressions
        current = node
        renaming = dict()
        while current.name == 'Extend':
            renaming[current.expr.n3()] = current.var.n3()
            current = current.p
        current['PV'] = node['PV']
        return parse_query_node(current,
                                dataset,
                                current_graphs,
                                server_url,
                                cardinalities,
                                renaming_map=renaming)
    elif node.name == 'AggregateJoin':
        groupby_variables = list()
        # build GROUP BY variables
        last_groupby_var = None
        if node.p.expr is None:
            # case 1: no explicit GROUP BY clause; keep one variable around
            # to substitute for '*' in aggregations like COUNT(*)
            last_groupby_var = list(node.p._vars)[0]
        else:  # case 2: there is an explicit group by
            for variable in node.p.expr:
                groupby_variables.append(variable.n3())
                last_groupby_var = variable
        # build aggregators for evaluating SPARQL aggregations (if any)
        aggregators = list()
        for agg in node.A:
            if agg.vars == '*':
                agg.vars = last_groupby_var
            if agg.name != 'Aggregate_Sample':
                aggregators.append(build_aggregator(dataset, agg,
                                                    renaming_map))
        # build source iterator from child node
        source = parse_query_node(node.p.p, dataset, current_graphs,
                                  server_url, cardinalities)
        # add the GROUP BY operator (with aggregators) to the pipeline
        source = GroupByAggregator(source,
                                   groupby_variables,
                                   aggregators=aggregators,
                                   max_size=dataset.max_group_by_size)
        # add the projection to the pipeline, depending on the context
        return AggregatesProjectionIterator(source, dataset, current_graphs[0],
                                            node.PV)
    else:
        raise UnsupportedSPARQL("Unsupported SPARQL feature: {}".format(
            node.name))
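A usage sketch for the parser above: the logical plan comes from rdflib's SPARQL algebra, so the entry point is typically the `SelectQuery` root node. Only the rdflib calls are known API here; `dataset`, `graph_uri` and `server_url` are placeholders assumed to be configured elsewhere:

from rdflib.plugins.sparql.algebra import translateQuery
from rdflib.plugins.sparql.parser import parseQuery

query = "SELECT ?s ?o WHERE { ?s <http://schema.org/eligibleRegion> ?o }"
logical_plan = translateQuery(parseQuery(query)).algebra  # root node: SelectQuery
cardinalities = []
physical_plan = parse_query_node(logical_plan, dataset, [graph_uri],
                                 server_url, cardinalities)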
Example #13
0
def parse_query_alt(node: dict,
                    dataset: Dataset,
                    current_graphs: List[str],
                    cardinalities: List[dict],
                    as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRI of the current RDF graphs queried.
      * cardinalities: A list used to accumulate triple pattern cardinality records.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns: An iterator used to evaluate the input node.

    Throws: `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        return parse_query_alt(node.p,
                               dataset,
                               graphs,
                               cardinalities,
                               as_of=as_of)
    elif node.name == 'ConstructQuery':
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        child = parse_query_alt(node.p,
                                dataset,
                                graphs,
                                cardinalities,
                                as_of=as_of)
        return ConstructIterator(child,
                                 convert_construct_template(node.template))
    elif node.name == 'Reduced':
        child = parse_query_alt(node.p,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return ReducedIterator(child)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_alt(node.p,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples,
                                                       dataset,
                                                       current_graphs,
                                                       as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_alt(node.p1,
                               dataset,
                               current_graphs,
                               cardinalities,
                               as_of=as_of)
        right = parse_query_alt(node.p2,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_alt(node.p,
                                   dataset,
                                   current_graphs,
                                   cardinalities,
                                   as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Extend':
        bgp_iterator = parse_query_alt(node.p,
                                       dataset,
                                       current_graphs,
                                       cardinalities,
                                       as_of=as_of)
        expression = parse_bind_expr(node.expr)
        #print("expression:"+str(expression))
        if isinstance(bgp_iterator, EmptyIterator):
            return BindIterator(None, expression, '?' + node.var)
        else:
            return BindIterator(bgp_iterator, expression, '?' + node.var)
    elif node.name == 'Join':
        left = parse_query_alt(node.p1,
                               dataset,
                               current_graphs,
                               cardinalities,
                               as_of=as_of)
        if node.p2.name == 'BGP':
            triples = list(localize_triples(node.p2.triples, current_graphs))
            variables = set(map(lambda t: t.n3(), node.p1._vars))
            #print("Join P1 _vars"+str(variables))
            iterator, query_vars, c = continue_left_join_tree(
                left, variables, triples, dataset, current_graphs)
            cardinalities += c
            return iterator
        else:
            raise UnsupportedSPARQL(
                f"Join Unsupported SPARQL feature: {node.p2.name}")
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
Example #14
0
def parse_query_node(node: dict,
                     dataset: Dataset,
                     current_graphs: List[str],
                     cardinalities: List[dict],
                     as_of: Optional[datetime] = None) -> PreemptableIterator:
    """Recursively parse node in the query logical plan to build a preemptable physical query execution plan.

    Args:
      * node: Node of the logical plan to parse (in rdflib format).
      * dataset: RDF dataset used to execute the query.
      * current_graphs: List of IRI of the current RDF graphs queried.
      * cardinalities: A list used to accumulate triple pattern cardinality records.
      * as_of: A timestamp used to perform all reads against a consistent version of the dataset. If `None`, use the latest version of the dataset, which does not guarantee snapshot isolation.

    Returns: An iterator used to evaluate the input node.

    Throws: `UnsupportedSPARQL` if the SPARQL query contains features not supported by the SaGe query engine.
    """
    if node.name == 'SelectQuery':
        # in case of a FROM clause, set the new default graphs used
        graphs = current_graphs
        if node.datasetClause is not None:
            graphs = [
                format_term(graph_iri.default)
                for graph_iri in node.datasetClause
            ]
        return parse_query_node(node.p,
                                dataset,
                                graphs,
                                cardinalities,
                                as_of=as_of)
    elif node.name == 'Project':
        query_vars = list(map(lambda t: '?' + str(t), node.PV))
        child = parse_query_node(node.p,
                                 dataset,
                                 current_graphs,
                                 cardinalities,
                                 as_of=as_of)
        return ProjectionIterator(child, query_vars)
    elif node.name == 'BGP':
        triples = list(localize_triples(node.triples, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples,
                                                       dataset,
                                                       current_graphs,
                                                       as_of=as_of)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    elif node.name == 'Union':
        left = parse_query_node(node.p1,
                                dataset,
                                current_graphs,
                                cardinalities,
                                as_of=as_of)
        right = parse_query_node(node.p2,
                                 dataset,
                                 current_graphs,
                                 cardinalities,
                                 as_of=as_of)
        return BagUnionIterator(left, right)
    elif node.name == 'Filter':
        expression = parse_filter_expr(node.expr)
        iterator = parse_query_node(node.p,
                                    dataset,
                                    current_graphs,
                                    cardinalities,
                                    as_of=as_of)
        return FilterIterator(iterator, expression)
    elif node.name == 'Join':
        # only allow joins between BGPs from different GRAPH clauses
        triples = (get_triples_from_graph(node.p1, current_graphs)
                   + get_triples_from_graph(node.p2, current_graphs))
        iterator, query_vars, c = build_left_join_tree(triples, dataset,
                                                       current_graphs)
        # track cardinalities of every triple pattern
        cardinalities += c
        return iterator
    else:
        raise UnsupportedSPARQL(f"Unsupported SPARQL feature: {node.name}")
Example #15
0
def build_join_plan(bgp, dataset, default_graph):
    """Build a join plan with a projection at the end"""
    iterator, query_vars, cardinalities = build_left_plan(
        bgp, dataset, default_graph)
    return ProjectionIterator(iterator, dataset, default_graph[0],
                              query_vars), cardinalities
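A usage sketch for `build_join_plan`, assuming the backend-API pattern format built in Example #12 (all concrete values below are placeholders):

bgp = [{
    'subject': '?s',
    'predicate': 'http://schema.org/eligibleRegion',
    'object': '?o',
    'graph': graph_uri,
}]
# Returns the root of the physical plan plus the collected cardinality records
plan, cardinalities = build_join_plan(bgp, dataset, [graph_uri])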