예제 #1
0
def get_orders_restricted_projected(startdate, enddate):
    """Execute the equivalent of the following SQL query, querying the CSV file orders.csv:
        SELECT
            orderkey, custkey
        FROM
            orders
        WHERE
            startdate <= orders.orderdate and orders.orderdate < enddate

    :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'.
    :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'.
    """
    timer = FunctionTimer()
    short_prints = True

    def read_date(date_str: str) -> datetime:
        return datetime.strptime(date_str, "%Y-%m-%d").date()

    orders_types = {"orderkey": int, "custkey": int, "orderdate": read_date, "totalprice": float, "shippriority": int}
    orders = csv.import_csv("orders.csv", orders_types)
    timer.lap("orders", short=short_prints)

    def select_dates(rel) -> bool:
        orderdate = rel("orderdate").value
        return (startdate <= orderdate) and (orderdate < enddate)

    orders_restricted = sets.restrict(orders, select_dates)
    timer.lap("orders_restricted", short=short_prints)

    orders_restricted_projected = clans.project(orders_restricted, "orderkey", "custkey")
    timer.end("orders_restricted_projected", short=short_prints)

    return orders_restricted_projected
예제 #2
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float}
    customers = csv.import_csv('customer.csv', customer_types)
    timer.lap('customers', short=short_prints)

    customers_nations = clans.cross_functional_union(customers, nations)
    timer.lap('customers_nations', short=short_prints)

    customers_nations_projected = clans.project(customers_nations, 'custkey',
                                                'nationkey', 'nationname')
    timer.end('customers_nations_projected', short=short_prints)

    return customers_nations_projected
예제 #3
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {"custkey": int, "nationkey": int, "acctbal": float}
    customers = csv.import_csv("customer.csv", customer_types)
    timer.lap("customers", short=short_prints)

    customers_nations = clans.functional_cross_union(customers, nations)
    timer.lap("customers_nations", short=short_prints)

    customers_nations_projected = clans.project(customers_nations, "custkey", "nationkey", "nationname")
    timer.end("customers_nations_projected", short=short_prints)

    return customers_nations_projected
예제 #4
0
def get_supplier_solutions():
    """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl:
        SELECT
            ?suppkey, ?nationkey
        FROM
            supplier
        WHERE {
            ?supplier <tpch:suppkey> ?suppkey .
            ?supplier <tpch:nationkey> ?nationkey .
        }
    """
    timer = FunctionTimer()
    short_prints = True

    suppliers = rdf.import_graph("supplier.ttl")
    timer.lap("suppliers", short=short_prints)

    # Find all triples that define a 'suppkey' (as predicate).
    bgp_suppkey_matches = clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:suppkey")}))
    # Give the subject a name for later joining and object the name we need in the output.
    bgp_suppkey = clans.compose(bgp_suppkey_matches, clans.from_dict({"supplier": "s", "suppkey": "o"}))

    # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for
    # later joining and object the name we need in the output.
    bgp_nationkey = clans.compose(
        clans.superstrict(suppliers, clans.from_dict({"p": rdflib.URIRef("tpch:nationkey")})),
        clans.from_dict({"supplier": "s", "nationkey": "o"}),
    )

    # Join the previous results on 'supplier' and project the columns we need.
    supplier_solutions = clans.project(clans.functional_cross_union(bgp_suppkey, bgp_nationkey), "nationkey", "suppkey")
    timer.end("supplier_solutions", short=short_prints)

    return supplier_solutions
예제 #5
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float}
    customers = csv.import_csv('customer.csv', customer_types)
    timer.lap('customers', short=short_prints)

    customers_nations = clans.cross_functional_union(customers, nations)
    timer.lap('customers_nations', short=short_prints)

    customers_nations_projected = clans.project(customers_nations,
                                                'custkey', 'nationkey', 'nationname')
    timer.end('customers_nations_projected', short=short_prints)

    return customers_nations_projected
예제 #6
0
def get_supplier_solutions():
    """Execute the equivalent of the following SPARQL query, querying the Turtle file supplier.ttl:
        SELECT
            ?suppkey, ?nationkey
        FROM
            supplier
        WHERE {
            ?supplier <tpch:suppkey> ?suppkey .
            ?supplier <tpch:nationkey> ?nationkey .
        }
    """
    timer = FunctionTimer()
    short_prints = True

    suppliers = rdf.import_graph('supplier.ttl')
    timer.lap('suppliers', short=short_prints)

    # Find all triples that define a 'suppkey' (as predicate).
    bgp_suppkey_matches = clans.superstrict(
        suppliers, clans.from_dict({'p': rdflib.URIRef('tpch:suppkey')}))
    # Give the subject a name for later joining and object the name we need in the output.
    bgp_suppkey = clans.compose(
        bgp_suppkey_matches, clans.from_dict({
            'supplier': 's',
            'suppkey': 'o'
        }))

    # Find all triples that define a 'nationkey' (as predicate) and give the subject a name for
    # later joining and object the name we need in the output.
    bgp_nationkey = clans.compose(
        clans.superstrict(
            suppliers, clans.from_dict({'p':
                                        rdflib.URIRef('tpch:nationkey')})),
        clans.from_dict({
            'supplier': 's',
            'nationkey': 'o'
        }))

    # Join the previous results on 'supplier' and project the columns we need.
    supplier_solutions = clans.project(
        clans.cross_functional_union(bgp_suppkey, bgp_nationkey), 'nationkey',
        'suppkey')
    timer.end('supplier_solutions', short=short_prints)

    return supplier_solutions
예제 #7
0
def get_orders_restricted_projected(startdate, enddate):
    """Execute the equivalent of the following SQL query, querying the CSV file orders.csv:
        SELECT
            orderkey, custkey
        FROM
            orders
        WHERE
            startdate <= orders.orderdate and orders.orderdate < enddate

    :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'.
    :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'.
    """
    timer = FunctionTimer()
    short_prints = True

    def read_date(date_str: str) -> datetime:
        return datetime.strptime(date_str, '%Y-%m-%d').date()

    orders_types = {
        'orderkey': int,
        'custkey': int,
        'orderdate': read_date,
        'totalprice': float,
        'shippriority': int
    }
    orders = csv.import_csv('orders.csv', orders_types)
    timer.lap('orders', short=short_prints)

    def select_dates(rel) -> bool:
        orderdate = rel('orderdate').value
        return (startdate <= orderdate) and (orderdate < enddate)

    orders_restricted = sets.restrict(orders, select_dates)
    timer.lap('orders_restricted', short=short_prints)

    orders_restricted_projected = clans.project(orders_restricted, 'orderkey',
                                                'custkey')
    timer.end('orders_restricted_projected', short=short_prints)

    return orders_restricted_projected
예제 #8
0
def project(clan: 'PP(M x M)', *lefts) -> 'PP(M x M)':
    clan = clans.project(clan, *lefts)
    return clan
예제 #9
0
def query5():
    # select
    #     nationname,
    #     sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue
    # from
    #     customer, orders, lineitem,   -- Loaded from CSV
    #     nation, region                -- Loaded from XML
    # where
    #     customer.custkey = orders.custkey
    #     and lineitem.orderkey = orders.orderkey
    #     and customer.nationkey = nation.nationkey
    #     and supplier.nationkey = nation.nationkey
    #     and nation.regionkey = region.regionkey
    #     and region.name = 'AMERICA'
    #     and orders.orderdate >= date '1996-01-01'
    #     and orders.orderdate < date '1996-01-01' + interval '1' year
    # group by
    #     n_name
    timer = FunctionTimer()
    short_prints = True

    # Join supplier_solutions and customers_nations_projected on 'nationkey'.
    result1 = clans.cross_functional_union(
        get_supplier_solutions(),
        get_customers_nations_projected(get_nations(region_name)))
    timer.lap('result1', short=short_prints)

    # Join result1 with orders_restricted_projected on 'custkey'.
    result2 = clans.cross_functional_union(
        result1, get_orders_restricted_projected(start_date, end_date))
    timer.lap('result2', short=short_prints)

    # Join result with lineitem on 'orderkey' and 'suppkey'.
    lineitem_types = {
        'orderkey': int,
        'suppkey': int,
        'extendedprice': float,
        'discount': float,
        'partkey': int,
        'linenumber': int,
        'quantity': int,
        'tax': float,
    }
    result3 = clans.cross_functional_union(
        result2, csv.import_csv('lineitem.csv', lineitem_types))
    timer.lap('result3', short=short_prints)

    # Add the 'revenue' column.
    def calc_revenue(rel):
        return Couplet(
            'revenue',
            rel('extendedprice').value * (1 - rel('discount').value))

    result4 = Set(
        relations.functional_add(rel, calc_revenue(rel)) for rel in result3)
    timer.lap('result4', short=short_prints)
    # Remove unnecessary columns.
    revenue_by_nations = clans.project(result4, 'revenue', 'nationname')

    # Partition the result on 'nationname'.
    revenue_grouped_by_nations = partition.partition(
        revenue_by_nations, lambda rel: rel('nationname'))
    timer.lap('revenue_grouped_by_nations', short=short_prints)

    # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.)
    def aggregate(horde, group_left, aggregation_left, aggregate_func):
        aggregation = {}
        for clan in horde:
            aggregation_value = aggregate_func.identity
            for relation in clan:
                aggregation_value = aggregate_func(
                    aggregation_value,
                    relation(aggregation_left).value)
            first_relation = next(iter(clan))
            aggregation[first_relation(group_left)] = aggregation_value
        return Set([
            Set(Couplet(group_left, key),
                Couplet(aggregation_left, aggregation[key]))
            for key in aggregation
        ])

    # Our aggregation function (adding two numbers, identity is 0).
    def aggregate_sum(arg1, arg2):
        return arg1 + arg2

    aggregate_sum.identity = 0

    # Calculate the aggregation result.
    # noinspection PyTypeChecker
    query5_result = aggregate(revenue_grouped_by_nations, 'nationname',
                              'revenue', aggregate_sum)
    timer.end('query5_result')

    return query5_result
예제 #10
0
print('america_region_name:\n' + mo_to_str(america_region_name))

america_region = superstrict(regions, america_region_name)
print('america_region:\n' + mo_to_str(america_region))

america_nations_names = america_region['nation']['name']
print('america_nations_names:\n' + mo_to_str(america_nations_names))

# Get all region key and nation name pairs
# for $x in doc("regions.xml")/regions/region/nation/name return <pair>{$x/../../regionkey}{$x}</pair>
from algebraixlib.mathobjects import Set
from algebraixlib.algebras.clans import project, cross_union
from algebraixlib.algebras.sets import union
region_key_nation_name_pairs_accumulator = Set()
for region in regions:
    region_key = project(Set(region), 'regionkey')
    print('region_key:\n' + mo_to_str(region_key))

    region_nations = region['nation']
    print('region_nations:\n' + mo_to_str(region_nations))

    region_nation_names = project(region_nations, 'name')
    print('region_nation_names:\n' + mo_to_str(region_nation_names))

    region_key_nation_name_pairs = cross_union(region_key, region_nation_names)
    print('region_key_nation_name_pairs:\n' + mo_to_str(region_key_nation_name_pairs))

    region_key_nation_name_pairs_accumulator = union(region_key_nation_name_pairs_accumulator, region_key_nation_name_pairs)
    print('region_key_nation_name_pairs_accumulator:\n' + mo_to_str(region_key_nation_name_pairs_accumulator))

print('region_key_nation_name_pairs_accumulator:\n' + mo_to_str(region_key_nation_name_pairs_accumulator))
예제 #11
0
print('america_region_name:\n' + mo_to_str(america_region_name))

america_region = superstrict(regions, america_region_name)
print('america_region:\n' + mo_to_str(america_region))

america_nations_names = america_region['nation']['name']
print('america_nations_names:\n' + mo_to_str(america_nations_names))

# Get all region key and nation name pairs
# for $x in doc("regions.xml")/regions/region/nation/name return <pair>{$x/../../regionkey}{$x}</pair>
from algebraixlib.mathobjects import Set
from algebraixlib.algebras.clans import project, cross_union
from algebraixlib.algebras.sets import union
region_key_nation_name_pairs_accumulator = Set()
for region in regions:
    region_key = project(Set(region), 'regionkey')
    print('region_key:\n' + mo_to_str(region_key))

    region_nations = region['nation']
    print('region_nations:\n' + mo_to_str(region_nations))

    region_nation_names = project(region_nations, 'name')
    print('region_nation_names:\n' + mo_to_str(region_nation_names))

    region_key_nation_name_pairs = cross_union(region_key, region_nation_names)
    print('region_key_nation_name_pairs:\n' +
          mo_to_str(region_key_nation_name_pairs))

    region_key_nation_name_pairs_accumulator = union(
        region_key_nation_name_pairs_accumulator, region_key_nation_name_pairs)
    print('region_key_nation_name_pairs_accumulator:\n' +
예제 #12
0
def project(clan: 'PP(M x M)', *lefts) -> 'PP(M x M)':
    clan = clans.project(clan, *lefts)
    return clan
예제 #13
0
def query5():
    # select
    #     nationname,
    #     sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue
    # from
    #     customer, orders, lineitem,   -- Loaded from CSV
    #     nation, region                -- Loaded from XML
    # where
    #     customer.custkey = orders.custkey
    #     and lineitem.orderkey = orders.orderkey
    #     and customer.nationkey = nation.nationkey
    #     and supplier.nationkey = nation.nationkey
    #     and nation.regionkey = region.regionkey
    #     and region.name = 'AMERICA'
    #     and orders.orderdate >= date '1996-01-01'
    #     and orders.orderdate < date '1996-01-01' + interval '1' year
    # group by
    #     n_name
    timer = FunctionTimer()
    short_prints = True

    # Join supplier_solutions and customers_nations_projected on 'nationkey'.
    result1 = clans.functional_cross_union(
        get_supplier_solutions(), get_customers_nations_projected(get_nations(region_name))
    )
    timer.lap("result1", short=short_prints)

    # Join result1 with orders_restricted_projected on 'custkey'.
    result2 = clans.functional_cross_union(result1, get_orders_restricted_projected(start_date, end_date))
    timer.lap("result2", short=short_prints)

    # Join result with lineitem on 'orderkey' and 'suppkey'.
    lineitem_types = {
        "orderkey": int,
        "suppkey": int,
        "extendedprice": float,
        "discount": float,
        "partkey": int,
        "linenumber": int,
        "quantity": int,
        "tax": float,
    }
    result3 = clans.functional_cross_union(result2, csv.import_csv("lineitem.csv", lineitem_types))
    timer.lap("result3", short=short_prints)

    # Add the 'revenue' column.
    def calc_revenue(rel):
        return Couplet("revenue", rel("extendedprice").value * (1 - rel("discount").value))

    result4 = Set(relations.functional_add(rel, calc_revenue(rel)) for rel in result3)
    timer.lap("result4", short=short_prints)
    # Remove unnecessary columns.
    revenue_by_nations = clans.project(result4, "revenue", "nationname")

    # Partition the result on 'nationname'.
    revenue_grouped_by_nations = partition.partition(revenue_by_nations, lambda rel: rel("nationname"))
    timer.lap("revenue_grouped_by_nations", short=short_prints)

    # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.)
    def aggregate(horde, group_left, aggregation_left, aggregate_func):
        aggregation = {}
        for clan in horde:
            aggregation_value = aggregate_func.identity
            for relation in clan:
                aggregation_value = aggregate_func(aggregation_value, relation(aggregation_left).value)
            first_relation = next(iter(clan))
            aggregation[first_relation(group_left)] = aggregation_value
        return Set([Set(Couplet(group_left, key), Couplet(aggregation_left, aggregation[key])) for key in aggregation])

    # Our aggregation function (adding two numbers, identity is 0).
    def aggregate_sum(arg1, arg2):
        return arg1 + arg2

    aggregate_sum.identity = 0

    # Calculate the aggregation result.
    # noinspection PyTypeChecker
    query5_result = aggregate(revenue_grouped_by_nations, "nationname", "revenue", aggregate_sum)
    timer.end("query5_result")

    return query5_result