def get_orders_restricted_projected(startdate, enddate): """Execute the equivalent of the following SQL query, querying the CSV file orders.csv: SELECT orderkey, custkey FROM orders WHERE startdate <= orders.orderdate and orders.orderdate < enddate :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'. :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'. """ timer = FunctionTimer() short_prints = True def read_date(date_str: str) -> datetime: return datetime.strptime(date_str, "%Y-%m-%d").date() orders_types = {"orderkey": int, "custkey": int, "orderdate": read_date, "totalprice": float, "shippriority": int} orders = csv.import_csv("orders.csv", orders_types) timer.lap("orders", short=short_prints) def select_dates(rel) -> bool: orderdate = rel("orderdate").value return (startdate <= orderdate) and (orderdate < enddate) orders_restricted = sets.restrict(orders, select_dates) timer.lap("orders_restricted", short=short_prints) orders_restricted_projected = clans.project(orders_restricted, "orderkey", "custkey") timer.end("orders_restricted_projected", short=short_prints) return orders_restricted_projected
def get_customers_nations_projected(nations): """Execute the equivalent of the following SQL query, querying the CSV file customer.csv: SELECT custkey, nationkey, nationname FROM customer JOIN nations ON customer.nationkey = nations.nationkey """ timer = FunctionTimer() short_prints = True customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float} customers = csv.import_csv('customer.csv', customer_types) timer.lap('customers', short=short_prints) customers_nations = clans.cross_functional_union(customers, nations) timer.lap('customers_nations', short=short_prints) customers_nations_projected = clans.project(customers_nations, 'custkey', 'nationkey', 'nationname') timer.end('customers_nations_projected', short=short_prints) return customers_nations_projected
def get_customers_nations_projected(nations): """Execute the equivalent of the following SQL query, querying the CSV file customer.csv: SELECT custkey, nationkey, nationname FROM customer JOIN nations ON customer.nationkey = nations.nationkey """ timer = FunctionTimer() short_prints = True customer_types = {"custkey": int, "nationkey": int, "acctbal": float} customers = csv.import_csv("customer.csv", customer_types) timer.lap("customers", short=short_prints) customers_nations = clans.functional_cross_union(customers, nations) timer.lap("customers_nations", short=short_prints) customers_nations_projected = clans.project(customers_nations, "custkey", "nationkey", "nationname") timer.end("customers_nations_projected", short=short_prints) return customers_nations_projected
def get_supplier_solutions(): """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl: SELECT ?suppkey, ?nationkey FROM supplier WHERE { ?supplier <tpch:suppkey> ?suppkey . ?supplier <tpch:nationkey> ?nationkey . } """ timer = FunctionTimer() short_prints = True suppliers = rdf.import_graph("supplier.ttl") timer.lap("suppliers", short=short_prints) # Find all triples that define a 'suppkey' (as predicate). bgp_suppkey_matches = clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:suppkey")})) # Give the subject a name for later joining and object the name we need in the output. bgp_suppkey = clans.compose(bgp_suppkey_matches, clans.from_dict({"supplier": "s", "suppkey": "o"})) # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for # later joining and object the name we need in the output. bgp_nationkey = clans.compose( clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:nationkey")})), clans.from_dict({"supplier": "s", "nationkey": "o"}), ) # Join the previous results on 'supplier' and project the columns we need. supplier_solutions = clans.project(clans.functional_cross_union(bgp_suppkey, bgp_nationkey), "nationkey", "suppkey") timer.end("supplier_solutions", short=short_prints) return supplier_solutions
def get_supplier_solutions(): """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl: SELECT ?suppkey, ?nationkey FROM supplier WHERE { ?supplier <tpch:suppkey> ?suppkey . ?supplier <tpch:nationkey> ?nationkey . } """ timer = FunctionTimer() short_prints = True suppliers = rdf.import_graph('supplier.ttl') timer.lap('suppliers', short=short_prints) # Find all triples that define a 'suppkey' (as predicate). bgp_suppkey_matches = clans.superstrict( suppliers, clans.from_dict({'p': rdflib.URIRef('tpch:suppkey')})) # Give the subject a name for later joining and object the name we need in the output. bgp_suppkey = clans.compose( bgp_suppkey_matches, clans.from_dict({ 'supplier': 's', 'suppkey': 'o' })) # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for # later joining and object the name we need in the output. bgp_nationkey = clans.compose( clans.superstrict( suppliers, clans.from_dict({'p': rdflib.URIRef('tpch:nationkey')})), clans.from_dict({ 'supplier': 's', 'nationkey': 'o' })) # Join the previous results on 'supplier' and project the columns we need. supplier_solutions = clans.project( clans.cross_functional_union(bgp_suppkey, bgp_nationkey), 'nationkey', 'suppkey') timer.end('supplier_solutions', short=short_prints) return supplier_solutions
def get_orders_restricted_projected(startdate, enddate): """Execute the equivalent of the following SQL query, querying the CSV file orders.csv: SELECT orderkey, custkey FROM orders WHERE startdate <= orders.orderdate and orders.orderdate < enddate :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'. :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'. """ timer = FunctionTimer() short_prints = True def read_date(date_str: str) -> datetime: return datetime.strptime(date_str, '%Y-%m-%d').date() orders_types = { 'orderkey': int, 'custkey': int, 'orderdate': read_date, 'totalprice': float, 'shippriority': int } orders = csv.import_csv('orders.csv', orders_types) timer.lap('orders', short=short_prints) def select_dates(rel) -> bool: orderdate = rel('orderdate').value return (startdate <= orderdate) and (orderdate < enddate) orders_restricted = sets.restrict(orders, select_dates) timer.lap('orders_restricted', short=short_prints) orders_restricted_projected = clans.project(orders_restricted, 'orderkey', 'custkey') timer.end('orders_restricted_projected', short=short_prints) return orders_restricted_projected
def project(clan: 'PP(M x M)', *lefts) -> 'PP(M x M)': clan = clans.project(clan, *lefts) return clan
def query5(): # select # nationname, # sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue # from # customer, orders, lineitem, -- Loaded from CSV # nation, region -- Loaded from XML # where # customer.custkey = orders.custkey # and lineitem.orderkey = orders.orderkey # and customer.nationkey = nation.nationkey # and supplier.nationkey = nation.nationkey # and nation.regionkey = region.regionkey # and region.name = 'AMERICA' # and orders.orderdate >= date '1996-01-01' # and orders.orderdate < date '1996-01-01' + interval '1' year # group by # n_name timer = FunctionTimer() short_prints = True # Join supplier_solutions and customers_nations_projected on 'nationkey'. result1 = clans.cross_functional_union( get_supplier_solutions(), get_customers_nations_projected(get_nations(region_name))) timer.lap('result1', short=short_prints) # Join result1 with orders_restricted_projected on 'custkey'. result2 = clans.cross_functional_union( result1, get_orders_restricted_projected(start_date, end_date)) timer.lap('result2', short=short_prints) # Join result with lineitem on 'orderkey' and 'suppkey'. lineitem_types = { 'orderkey': int, 'suppkey': int, 'extendedprice': float, 'discount': float, 'partkey': int, 'linenumber': int, 'quantity': int, 'tax': float, } result3 = clans.cross_functional_union( result2, csv.import_csv('lineitem.csv', lineitem_types)) timer.lap('result3', short=short_prints) # Add the 'revenue' column. def calc_revenue(rel): return Couplet( 'revenue', rel('extendedprice').value * (1 - rel('discount').value)) result4 = Set( relations.functional_add(rel, calc_revenue(rel)) for rel in result3) timer.lap('result4', short=short_prints) # Remove unnecessary columns. revenue_by_nations = clans.project(result4, 'revenue', 'nationname') # Partition the result on 'nationname'. revenue_grouped_by_nations = partition.partition( revenue_by_nations, lambda rel: rel('nationname')) timer.lap('revenue_grouped_by_nations', short=short_prints) # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.) def aggregate(horde, group_left, aggregation_left, aggregate_func): aggregation = {} for clan in horde: aggregation_value = aggregate_func.identity for relation in clan: aggregation_value = aggregate_func( aggregation_value, relation(aggregation_left).value) first_relation = next(iter(clan)) aggregation[first_relation(group_left)] = aggregation_value return Set([ Set(Couplet(group_left, key), Couplet(aggregation_left, aggregation[key])) for key in aggregation ]) # Our aggregation function (adding two numbers, identity is 0). def aggregate_sum(arg1, arg2): return arg1 + arg2 aggregate_sum.identity = 0 # Calculate the aggregation result. # noinspection PyTypeChecker query5_result = aggregate(revenue_grouped_by_nations, 'nationname', 'revenue', aggregate_sum) timer.end('query5_result') return query5_result
print('america_region_name:\n' + mo_to_str(america_region_name)) america_region = superstrict(regions, america_region_name) print('america_region:\n' + mo_to_str(america_region)) america_nations_names = america_region['nation']['name'] print('america_nations_names:\n' + mo_to_str(america_nations_names)) # Get all region key and nation name pairs # for $x in doc("regions.xml")/regions/region/nation/name return <pair>{$x/../../regionkey}{$x}</pair> from algebraixlib.mathobjects import Set from algebraixlib.algebras.clans import project, cross_union from algebraixlib.algebras.sets import union region_key_nation_name_pairs_accumulator = Set() for region in regions: region_key = project(Set(region), 'regionkey') print('region_key:\n' + mo_to_str(region_key)) region_nations = region['nation'] print('region_nations:\n' + mo_to_str(region_nations)) region_nation_names = project(region_nations, 'name') print('region_nation_names:\n' + mo_to_str(region_nation_names)) region_key_nation_name_pairs = cross_union(region_key, region_nation_names) print('region_key_nation_name_pairs:\n' + mo_to_str(region_key_nation_name_pairs)) region_key_nation_name_pairs_accumulator = union(region_key_nation_name_pairs_accumulator, region_key_nation_name_pairs) print('region_key_nation_name_pairs_accumulator:\n' + mo_to_str(region_key_nation_name_pairs_accumulator)) print('region_key_nation_name_pairs_accumulator:\n' + mo_to_str(region_key_nation_name_pairs_accumulator))
print('america_region_name:\n' + mo_to_str(america_region_name)) america_region = superstrict(regions, america_region_name) print('america_region:\n' + mo_to_str(america_region)) america_nations_names = america_region['nation']['name'] print('america_nations_names:\n' + mo_to_str(america_nations_names)) # Get all region key and nation name pairs # for $x in doc("regions.xml")/regions/region/nation/name return <pair>{$x/../../regionkey}{$x}</pair> from algebraixlib.mathobjects import Set from algebraixlib.algebras.clans import project, cross_union from algebraixlib.algebras.sets import union region_key_nation_name_pairs_accumulator = Set() for region in regions: region_key = project(Set(region), 'regionkey') print('region_key:\n' + mo_to_str(region_key)) region_nations = region['nation'] print('region_nations:\n' + mo_to_str(region_nations)) region_nation_names = project(region_nations, 'name') print('region_nation_names:\n' + mo_to_str(region_nation_names)) region_key_nation_name_pairs = cross_union(region_key, region_nation_names) print('region_key_nation_name_pairs:\n' + mo_to_str(region_key_nation_name_pairs)) region_key_nation_name_pairs_accumulator = union( region_key_nation_name_pairs_accumulator, region_key_nation_name_pairs) print('region_key_nation_name_pairs_accumulator:\n' +
def query5(): # select # nationname, # sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue # from # customer, orders, lineitem, -- Loaded from CSV # nation, region -- Loaded from XML # where # customer.custkey = orders.custkey # and lineitem.orderkey = orders.orderkey # and customer.nationkey = nation.nationkey # and supplier.nationkey = nation.nationkey # and nation.regionkey = region.regionkey # and region.name = 'AMERICA' # and orders.orderdate >= date '1996-01-01' # and orders.orderdate < date '1996-01-01' + interval '1' year # group by # n_name timer = FunctionTimer() short_prints = True # Join supplier_solutions and customers_nations_projected on 'nationkey'. result1 = clans.functional_cross_union( get_supplier_solutions(), get_customers_nations_projected(get_nations(region_name)) ) timer.lap("result1", short=short_prints) # Join result1 with orders_restricted_projected on 'custkey'. result2 = clans.functional_cross_union(result1, get_orders_restricted_projected(start_date, end_date)) timer.lap("result2", short=short_prints) # Join result with lineitem on 'orderkey' and 'suppkey'. lineitem_types = { "orderkey": int, "suppkey": int, "extendedprice": float, "discount": float, "partkey": int, "linenumber": int, "quantity": int, "tax": float, } result3 = clans.functional_cross_union(result2, csv.import_csv("lineitem.csv", lineitem_types)) timer.lap("result3", short=short_prints) # Add the 'revenue' column. def calc_revenue(rel): return Couplet("revenue", rel("extendedprice").value * (1 - rel("discount").value)) result4 = Set(relations.functional_add(rel, calc_revenue(rel)) for rel in result3) timer.lap("result4", short=short_prints) # Remove unnecessary columns. revenue_by_nations = clans.project(result4, "revenue", "nationname") # Partition the result on 'nationname'. revenue_grouped_by_nations = partition.partition(revenue_by_nations, lambda rel: rel("nationname")) timer.lap("revenue_grouped_by_nations", short=short_prints) # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.) def aggregate(horde, group_left, aggregation_left, aggregate_func): aggregation = {} for clan in horde: aggregation_value = aggregate_func.identity for relation in clan: aggregation_value = aggregate_func(aggregation_value, relation(aggregation_left).value) first_relation = next(iter(clan)) aggregation[first_relation(group_left)] = aggregation_value return Set([Set(Couplet(group_left, key), Couplet(aggregation_left, aggregation[key])) for key in aggregation]) # Our aggregation function (adding two numbers, identity is 0). def aggregate_sum(arg1, arg2): return arg1 + arg2 aggregate_sum.identity = 0 # Calculate the aggregation result. # noinspection PyTypeChecker query5_result = aggregate(revenue_grouped_by_nations, "nationname", "revenue", aggregate_sum) timer.end("query5_result") return query5_result