Example #1
0
def get_supplier_solutions():
    """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl:
        SELECT
            ?suppkey, ?nationkey
        FROM
            supplier
        WHERE {
            ?supplier <tpch:suppkey> ?suppkey .
            ?supplier <tpch:nationkey> ?nationkey .
        }
    """
    timer = FunctionTimer()
    short_prints = True

    suppliers = rdf.import_graph("supplier.ttl")
    timer.lap("suppliers", short=short_prints)

    # Find all triples that define a 'suppkey' (as predicate).
    bgp_suppkey_matches = clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:suppkey")}))
    # Give the subject a name for later joining and object the name we need in the output.
    bgp_suppkey = clans.compose(bgp_suppkey_matches, clans.from_dict({"supplier": "s", "suppkey": "o"}))

    # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for
    # later joining and object the name we need in the output.
    bgp_nationkey = clans.compose(
        clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:nationkey")})),
        clans.from_dict({"supplier": "s", "nationkey": "o"}),
    )

    # Join the previous results on 'supplier' and project the columns we need.
    supplier_solutions = clans.project(clans.functional_cross_union(bgp_suppkey, bgp_nationkey), "nationkey", "suppkey")
    timer.end("supplier_solutions", short=short_prints)

    return supplier_solutions
Example #2
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float}
    customers = csv.import_csv('customer.csv', customer_types)
    timer.lap('customers', short=short_prints)

    customers_nations = clans.cross_functional_union(customers, nations)
    timer.lap('customers_nations', short=short_prints)

    customers_nations_projected = clans.project(customers_nations, 'custkey',
                                                'nationkey', 'nationname')
    timer.end('customers_nations_projected', short=short_prints)

    return customers_nations_projected
Example #3
0
def get_orders_restricted_projected(startdate, enddate):
    """Execute the equivalent of the following SQL query, querying the CSV file orders.csv:
        SELECT
            orderkey, custkey
        FROM
            orders
        WHERE
            startdate <= orders.orderdate and orders.orderdate < enddate

    :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'.
    :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'.
    """
    timer = FunctionTimer()
    short_prints = True

    def read_date(date_str: str) -> datetime:
        return datetime.strptime(date_str, "%Y-%m-%d").date()

    orders_types = {"orderkey": int, "custkey": int, "orderdate": read_date, "totalprice": float, "shippriority": int}
    orders = csv.import_csv("orders.csv", orders_types)
    timer.lap("orders", short=short_prints)

    def select_dates(rel) -> bool:
        orderdate = rel("orderdate").value
        return (startdate <= orderdate) and (orderdate < enddate)

    orders_restricted = sets.restrict(orders, select_dates)
    timer.lap("orders_restricted", short=short_prints)

    orders_restricted_projected = clans.project(orders_restricted, "orderkey", "custkey")
    timer.end("orders_restricted_projected", short=short_prints)

    return orders_restricted_projected
Example #4
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {"custkey": int, "nationkey": int, "acctbal": float}
    customers = csv.import_csv("customer.csv", customer_types)
    timer.lap("customers", short=short_prints)

    customers_nations = clans.functional_cross_union(customers, nations)
    timer.lap("customers_nations", short=short_prints)

    customers_nations_projected = clans.project(customers_nations, "custkey", "nationkey", "nationname")
    timer.end("customers_nations_projected", short=short_prints)

    return customers_nations_projected
Example #5
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float}
    customers = csv.import_csv('customer.csv', customer_types)
    timer.lap('customers', short=short_prints)

    customers_nations = clans.cross_functional_union(customers, nations)
    timer.lap('customers_nations', short=short_prints)

    customers_nations_projected = clans.project(customers_nations,
                                                'custkey', 'nationkey', 'nationname')
    timer.end('customers_nations_projected', short=short_prints)

    return customers_nations_projected
Example #6
0
def get_orders_restricted_projected(startdate, enddate):
    """Execute the equivalent of the following SQL query, querying the CSV file orders.csv:
        SELECT
            orderkey, custkey
        FROM
            orders
        WHERE
            startdate <= orders.orderdate and orders.orderdate < enddate

    :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'.
    :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'.
    """
    timer = FunctionTimer()
    short_prints = True

    def read_date(date_str: str) -> datetime:
        return datetime.strptime(date_str, '%Y-%m-%d').date()

    orders_types = {
        'orderkey': int,
        'custkey': int,
        'orderdate': read_date,
        'totalprice': float,
        'shippriority': int
    }
    orders = csv.import_csv('orders.csv', orders_types)
    timer.lap('orders', short=short_prints)

    def select_dates(rel) -> bool:
        orderdate = rel('orderdate').value
        return (startdate <= orderdate) and (orderdate < enddate)

    orders_restricted = sets.restrict(orders, select_dates)
    timer.lap('orders_restricted', short=short_prints)

    orders_restricted_projected = clans.project(orders_restricted, 'orderkey',
                                                'custkey')
    timer.end('orders_restricted_projected', short=short_prints)

    return orders_restricted_projected
Example #7
0
def get_supplier_solutions():
    """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl:
        SELECT
            ?suppkey, ?nationkey
        FROM
            supplier
        WHERE {
            ?supplier <tpch:suppkey> ?suppkey .
            ?supplier <tpch:nationkey> ?nationkey .
        }
    """
    timer = FunctionTimer()
    short_prints = True

    suppliers = rdf.import_graph('supplier.ttl')
    timer.lap('suppliers', short=short_prints)

    # Find all triples that define a 'suppkey' (as predicate).
    bgp_suppkey_matches = clans.superstrict(
        suppliers, clans.from_dict({'p': rdflib.URIRef('tpch:suppkey')}))
    # Give the subject a name for later joining and object the name we need in the output.
    bgp_suppkey = clans.compose(
        bgp_suppkey_matches, clans.from_dict({
            'supplier': 's',
            'suppkey': 'o'
        }))

    # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for
    # later joining and object the name we need in the output.
    bgp_nationkey = clans.compose(
        clans.superstrict(
            suppliers, clans.from_dict({'p':
                                        rdflib.URIRef('tpch:nationkey')})),
        clans.from_dict({
            'supplier': 's',
            'nationkey': 'o'
        }))

    # Join the previous results on 'supplier' and project the columns we need.
    supplier_solutions = clans.project(
        clans.cross_functional_union(bgp_suppkey, bgp_nationkey), 'nationkey',
        'suppkey')
    timer.end('supplier_solutions', short=short_prints)

    return supplier_solutions
Example #8
0
def query5():
    # select
    #     nationname,
    #     sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue
    # from
    #     customer, orders, lineitem,   -- Loaded from CSV
    #     nation, region                -- Loaded from XML
    # where
    #     customer.custkey = orders.custkey
    #     and lineitem.orderkey = orders.orderkey
    #     and customer.nationkey = nation.nationkey
    #     and supplier.nationkey = nation.nationkey
    #     and nation.regionkey = region.regionkey
    #     and region.name = 'AMERICA'
    #     and orders.orderdate >= date '1996-01-01'
    #     and orders.orderdate < date '1996-01-01' + interval '1' year
    # group by
    #     n_name
    timer = FunctionTimer()
    short_prints = True

    # Join supplier_solutions and customers_nations_projected on 'nationkey'.
    result1 = clans.cross_functional_union(
        get_supplier_solutions(),
        get_customers_nations_projected(get_nations(region_name)))
    timer.lap('result1', short=short_prints)

    # Join result1 with orders_restricted_projected on 'custkey'.
    result2 = clans.cross_functional_union(
        result1, get_orders_restricted_projected(start_date, end_date))
    timer.lap('result2', short=short_prints)

    # Join result with lineitem on 'orderkey' and 'suppkey'.
    lineitem_types = {
        'orderkey': int,
        'suppkey': int,
        'extendedprice': float,
        'discount': float,
        'partkey': int,
        'linenumber': int,
        'quantity': int,
        'tax': float,
    }
    result3 = clans.cross_functional_union(
        result2, csv.import_csv('lineitem.csv', lineitem_types))
    timer.lap('result3', short=short_prints)

    # Add the 'revenue' column.
    def calc_revenue(rel):
        return Couplet(
            'revenue',
            rel('extendedprice').value * (1 - rel('discount').value))

    result4 = Set(
        relations.functional_add(rel, calc_revenue(rel)) for rel in result3)
    timer.lap('result4', short=short_prints)
    # Remove unnecessary columns.
    revenue_by_nations = clans.project(result4, 'revenue', 'nationname')

    # Partition the result on 'nationname'.
    revenue_grouped_by_nations = partition.partition(
        revenue_by_nations, lambda rel: rel('nationname'))
    timer.lap('revenue_grouped_by_nations', short=short_prints)

    # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.)
    def aggregate(horde, group_left, aggregation_left, aggregate_func):
        aggregation = {}
        for clan in horde:
            aggregation_value = aggregate_func.identity
            for relation in clan:
                aggregation_value = aggregate_func(
                    aggregation_value,
                    relation(aggregation_left).value)
            first_relation = next(iter(clan))
            aggregation[first_relation(group_left)] = aggregation_value
        return Set([
            Set(Couplet(group_left, key),
                Couplet(aggregation_left, aggregation[key]))
            for key in aggregation
        ])

    # Our aggregation function (adding two numbers, identity is 0).
    def aggregate_sum(arg1, arg2):
        return arg1 + arg2

    aggregate_sum.identity = 0

    # Calculate the aggregation result.
    # noinspection PyTypeChecker
    query5_result = aggregate(revenue_grouped_by_nations, 'nationname',
                              'revenue', aggregate_sum)
    timer.end('query5_result')

    return query5_result
Example #9
0
def get_nations(regionname):
    """Execute the equivalent of the following XQuery statement and convert the XML into a clan:
        for $x in doc("regions.xml")/regions/region[name="MIDDLE EAST"]/nation
            return <nation>{$x/nationkey}<nationname>{data($x/name)}</nationname></nation>
    """
    timer = FunctionTimer()
    short_prints = True

    # Load the XML document. (Don't use multiplicity or sequence; our data doesn't require this.)
    regions = xml.import_xml('regions.xml', convert_numerics=True)
    timer.lap('regions', short=short_prints)

    # Get a clan where each region is a row.
    regions_clan = regions('regions')['region']
    timer.lap('regions_clan', short=short_prints)

    # Filter this clan down to the region of interest (name is `regionname`).
    target_region = clans.superstrict(regions_clan,
                                      clans.from_dict({'name': regionname}))
    timer.lap('target_region', short=short_prints)

    # Get all 'nation' lefts out of this clan and create a clan where every row is a nation's data.
    nations_clan = target_region['nation']
    timer.lap('nations_clan', short=short_prints)

    # Rename 'name' to 'nationname' and project 'nationkey' and 'nationname' (removing 'comment').
    nations = clans.compose(
        nations_clan,
        clans.from_dict({
            'nationkey': 'nationkey',
            'nationname': 'name'
        }))
    timer.end('nations', short=short_prints)

    return nations
Example #10
0
def query5():
    # select
    #     nationname,
    #     sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue
    # from
    #     customer, orders, lineitem,   -- Loaded from CSV
    #     nation, region                -- Loaded from XML
    # where
    #     customer.custkey = orders.custkey
    #     and lineitem.orderkey = orders.orderkey
    #     and customer.nationkey = nation.nationkey
    #     and supplier.nationkey = nation.nationkey
    #     and nation.regionkey = region.regionkey
    #     and region.name = 'AMERICA'
    #     and orders.orderdate >= date '1996-01-01'
    #     and orders.orderdate < date '1996-01-01' + interval '1' year
    # group by
    #     n_name
    timer = FunctionTimer()
    short_prints = True

    # Join supplier_solutions and customers_nations_projected on 'nationkey'.
    result1 = clans.functional_cross_union(
        get_supplier_solutions(), get_customers_nations_projected(get_nations(region_name))
    )
    timer.lap("result1", short=short_prints)

    # Join result1 with orders_restricted_projected on 'custkey'.
    result2 = clans.functional_cross_union(result1, get_orders_restricted_projected(start_date, end_date))
    timer.lap("result2", short=short_prints)

    # Join result with lineitem on 'orderkey' and 'suppkey'.
    lineitem_types = {
        "orderkey": int,
        "suppkey": int,
        "extendedprice": float,
        "discount": float,
        "partkey": int,
        "linenumber": int,
        "quantity": int,
        "tax": float,
    }
    result3 = clans.functional_cross_union(result2, csv.import_csv("lineitem.csv", lineitem_types))
    timer.lap("result3", short=short_prints)

    # Add the 'revenue' column.
    def calc_revenue(rel):
        return Couplet("revenue", rel("extendedprice").value * (1 - rel("discount").value))

    result4 = Set(relations.functional_add(rel, calc_revenue(rel)) for rel in result3)
    timer.lap("result4", short=short_prints)
    # Remove unnecessary columns.
    revenue_by_nations = clans.project(result4, "revenue", "nationname")

    # Partition the result on 'nationname'.
    revenue_grouped_by_nations = partition.partition(revenue_by_nations, lambda rel: rel("nationname"))
    timer.lap("revenue_grouped_by_nations", short=short_prints)

    # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.)
    def aggregate(horde, group_left, aggregation_left, aggregate_func):
        aggregation = {}
        for clan in horde:
            aggregation_value = aggregate_func.identity
            for relation in clan:
                aggregation_value = aggregate_func(aggregation_value, relation(aggregation_left).value)
            first_relation = next(iter(clan))
            aggregation[first_relation(group_left)] = aggregation_value
        return Set([Set(Couplet(group_left, key), Couplet(aggregation_left, aggregation[key])) for key in aggregation])

    # Our aggregation function (adding two numbers, identity is 0).
    def aggregate_sum(arg1, arg2):
        return arg1 + arg2

    aggregate_sum.identity = 0

    # Calculate the aggregation result.
    # noinspection PyTypeChecker
    query5_result = aggregate(revenue_grouped_by_nations, "nationname", "revenue", aggregate_sum)
    timer.end("query5_result")

    return query5_result
Example #11
0
def get_nations(regionname):
    """Execute the equivalent of the following XQuery statement and convert the XML into a clan:
        for $x in doc("regions.xml")/regions/region[name="MIDDLE EAST"]/nation
            return <nation>{$x/nationkey}<nationname>{data($x/name)}</nationname></nation>
    """
    timer = FunctionTimer()
    short_prints = True

    # Load the XML document. (Don't use multiplicity or sequence; our data doesn't require this.)
    regions = xml.import_xml("regions.xml", convert_numerics=True)
    timer.lap("regions", short=short_prints)

    # Get a clan where each region is a row.
    regions_clan = regions("regions")["region"]
    timer.lap("regions_clan", short=short_prints)

    # Filter this clan down to the region of interest (name is `regionname`).
    target_region = clans.superstrict(regions_clan, clans.from_dict({"name": regionname}))
    timer.lap("target_region", short=short_prints)

    # Get all 'nation' lefts out of this clan and create a clan where every row is a nation's data.
    nations_clan = target_region["nation"]
    timer.lap("nations_clan", short=short_prints)

    # Rename 'name' to 'nationname' and project 'nationkey' and 'nationname' (removing 'comment').
    nations = clans.compose(nations_clan, clans.from_dict({"nationkey": "nationkey", "nationname": "name"}))
    timer.end("nations", short=short_prints)

    return nations