def protocol():
    """Build the three-party "green" workflow and return its input relations.

    Three relations ("green1", "green2", "green3"), each with INTEGER
    columns ``companyID`` and ``price`` and owned by parties 1, 2 and 3
    respectively, are concatenated and pushed through a chain of
    aggregate / divide / multiply / join operators ending in a relation
    named "hhi", which is collected by party 1.

    :return: set holding the three created input relations
    """
    # per-party input schemas and relations
    schema_one = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]
    green_one = sal.create("green1", schema_one, {1})

    schema_two = [
        defCol("companyID", "INTEGER", [2]),
        defCol("price", "INTEGER", [2]),
    ]
    green_two = sal.create("green2", schema_two, {2})

    schema_three = [
        defCol("companyID", "INTEGER", [3]),
        defCol("price", "INTEGER", [3]),
    ]
    green_three = sal.create("green3", schema_three, {3})

    combined = sal.concat([green_one, green_two, green_three], "cab_data")
    chosen = sal.project(combined, "selected_input", ["companyID", "price"])

    # sum of price per companyID
    per_company = sal.aggregate(
        chosen, "local_rev", ["companyID"], "price", "+", "local_rev")
    downscaled = sal.divide(
        per_company, "scaled_down", "local_rev", ["local_rev", 1000])

    # companyID * 0: presumably collapses every row onto one key so the
    # following aggregate acts as a global sum -- TODO confirm
    zero_keyed = sal.multiply(
        downscaled, "first_val_blank", "companyID", ["companyID", 0])
    upscaled = sal.multiply(
        zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    overall = sal.aggregate(
        zero_keyed, "total_rev", ["companyID"], "local_rev", "+", "global_rev")

    paired = sal.join(
        upscaled, overall, "local_total_rev", ["companyID"], ["companyID"])
    share = sal.divide(
        paired, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = sal.multiply(
        share, "market_share_squared", "local_rev",
        ["local_rev", "local_rev", 1])
    hhi_rel = sal.aggregate(
        share_squared, "hhi", ["companyID"], "local_rev", "+", "hhi")

    sal.collect(hhi_rel, 1)

    # root nodes of the workflow
    return {green_one, green_two, green_three}
def protocol():
    """Build the three-party "yellow" workflow and return its input relations.

    Relations "yellow1", "yellow2" and "yellow3" (INTEGER columns
    ``companyID`` and ``price``; owners 1, 2 and 3) are concatenated and
    run through aggregate / divide / multiply / join operators down to a
    relation named "hhi". A final projection onto ``companyID`` and
    ``hhi`` is collected by party 1.

    :return: set holding the three created input relations
    """
    schema_one = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]
    yellow_one = cc.create("yellow1", schema_one, {1})

    schema_two = [
        defCol("companyID", "INTEGER", [2]),
        defCol("price", "INTEGER", [2]),
    ]
    yellow_two = cc.create("yellow2", schema_two, {2})

    schema_three = [
        defCol("companyID", "INTEGER", [3]),
        defCol("price", "INTEGER", [3]),
    ]
    yellow_three = cc.create("yellow3", schema_three, {3})

    combined = cc.concat([yellow_one, yellow_two, yellow_three], "cab_data")
    chosen = cc.project(combined, "selected_input", ["companyID", "price"])

    # sum of price per companyID
    per_company = cc.aggregate(
        chosen, "local_rev", ["companyID"], "price", "sum", "local_rev")
    downscaled = cc.divide(
        per_company, "scaled_down", "local_rev", ["local_rev", 1000])

    # companyID * 0: presumably puts every row under one key so the next
    # aggregate is a global sum -- TODO confirm
    zero_keyed = cc.multiply(
        downscaled, "first_val_blank", "companyID", ["companyID", 0])
    upscaled = cc.multiply(
        zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    overall = cc.aggregate(
        zero_keyed, "total_rev", ["companyID"], "local_rev", "sum",
        "global_rev")

    paired = cc.join(
        upscaled, overall, "local_total_rev", ["companyID"], ["companyID"])
    share = cc.divide(
        paired, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = cc.multiply(
        share, "market_share_squared", "local_rev",
        ["local_rev", "local_rev", 1])
    hhi_rel = cc.aggregate(
        share_squared, "hhi", ["companyID"], "local_rev", "sum", "hhi")

    # dummy projection to force a non-MPC sub-DAG (per the original note)
    hhi_projection = cc.project(hhi_rel, "hhi_only", ["companyID", "hhi"])
    cc.collect(hhi_projection, 1)

    # root nodes of the workflow
    return {yellow_one, yellow_two, yellow_three}
def protocol():
    """Build the three-party workflow with explicit close/open operators.

    Inputs "in1", "in2" and "in3" (INTEGER columns ``companyID`` and
    ``price``; owners 1, 2 and 3) are each passed through ``sal._close``
    with the party set {1, 2, 3}, concatenated, and processed by the
    aggregate / divide / multiply / join chain ending in a relation
    named "hhi", which is passed to ``sal._open`` for party 1.

    :return: set holding the three created input relations
    """
    everyone = (1, 2, 3)

    # define inputs
    schema_one = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]
    source_one = sal.create("in1", schema_one, {1})

    schema_two = [
        defCol("companyID", "INTEGER", [2]),
        defCol("price", "INTEGER", [2]),
    ]
    source_two = sal.create("in2", schema_two, {2})

    schema_three = [
        defCol("companyID", "INTEGER", [3]),
        defCol("price", "INTEGER", [3]),
    ]
    source_three = sal.create("in3", schema_three, {3})

    # a fresh party set per call, as in the original
    closed_one = sal._close(source_one, "cl1", set(everyone))
    closed_two = sal._close(source_two, "cl2", set(everyone))
    closed_three = sal._close(source_three, "cl3", set(everyone))

    combined = sal.concat([closed_one, closed_two, closed_three], "cab_data")
    chosen = sal.project(combined, "selected_input", ["companyID", "price"])

    per_company = sal.aggregate(
        chosen, "local_rev", ["companyID"], "price", "+", "local_rev")
    downscaled = sal.divide(
        per_company, "scaled_down", "local_rev", ["local_rev", 1000])

    # companyID * 0: presumably collapses all rows onto one key so the
    # following aggregate is a global sum -- TODO confirm
    zero_keyed = sal.multiply(
        downscaled, "first_val_blank", "companyID", ["companyID", 0])
    upscaled = sal.multiply(
        zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    overall = sal.aggregate(
        zero_keyed, "total_rev", ["companyID"], "local_rev", "+", "global_rev")

    paired = sal.join(
        upscaled, overall, "local_total_rev", ["companyID"], ["companyID"])
    share = sal.divide(
        paired, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = sal.multiply(
        share, "market_share_squared", "local_rev",
        ["local_rev", "local_rev", 1])
    hhi_rel = sal.aggregate(
        share_squared, "hhi", ["companyID"], "local_rev", "+", "hhi")

    # the returned node is not needed locally
    sal._open(hhi_rel, "hhi_opened", 1)

    # root nodes of the workflow
    return {source_one, source_two, source_three}
def protocol():
    """Build a three-party concat / project / aggregate / divide / multiply workflow.

    Each party (1, 2, 3) contributes a relation with INTEGER columns
    ``a`` and ``b``. The combined relation is projected, ``b`` is summed
    per ``a`` into ``total_b``, ``a`` is divided and then multiplied by
    the constant 1, and the result is collected by party 1.

    :return: set holding the three created input relations
    """
    # define inputs
    schema_one = [
        defCol("a", "INTEGER", [1]),
        defCol("b", "INTEGER", [1]),
    ]
    first = cc.create("in_1", schema_one, {1})

    schema_two = [
        defCol("a", "INTEGER", [2]),
        defCol("b", "INTEGER", [2]),
    ]
    # NOTE(review): "in2" lacks the underscore its siblings use; kept
    # verbatim because the relation name is a runtime string
    second = cc.create("in2", schema_two, {2})

    schema_three = [
        defCol("a", "INTEGER", [3]),
        defCol("b", "INTEGER", [3]),
    ]
    third = cc.create("in_3", schema_three, {3})

    # combine parties' inputs into one relation
    combined = cc.concat([first, second, third], "rel")

    projected = cc.project(combined, "proj", ["a", "b"])
    summed = cc.aggregate(projected, "agg", ["a"], "b", "sum", "total_b")
    divided = cc.divide(summed, "div", "a", ["a", 1])
    multiplied = cc.multiply(divided, "mult", "a", ["a", 1])

    cc.collect(multiplied, 1)

    # root nodes of the workflow
    return {first, second, third}
def mult_mixed():
    """Multiply columns ``a`` and ``b`` of the relation supplied by ``setup``.

    The product is written to column ``a`` of a relation named "res",
    which is then passed to ``sal._open`` for party 1.

    :return: the inputs object returned by ``setup()``
    """
    roots, combined = setup()
    product = sal.multiply(combined, "res", "a", ["a", "b"])
    # the opened node itself is not needed here
    sal._open(product, "opened", 1)
    return roots
def protocol():
    """Multiply ``a`` by ``b`` on the relation from ``setup`` and open the result.

    The multiply node is named 'mult1' and writes into column 'a'; its
    output is passed to ``sal._open`` for party 1.

    :return: the inputs object returned by ``setup()``
    """
    roots, combined = setup()
    product = sal.multiply(combined, 'mult1', 'a', ['a', 'b'])
    # the opened node itself is not needed here
    sal._open(product, "opened", 1)
    return roots
def protocol():
    """Build a three-party concat / project / aggregate / divide / multiply workflow.

    Each party (1, 2, 3) contributes a relation with INTEGER columns
    ``a`` and ``b``. After concatenation and projection, ``b`` is
    aggregated with "+" per ``a`` into ``total_b``; ``a`` is then divided
    and multiplied by the constant 1 and the result collected by party 1.

    :return: set holding the three created input relations
    """
    # define inputs
    schema_one = [
        defCol("a", "INTEGER", [1]),
        defCol("b", "INTEGER", [1]),
    ]
    first = sal.create("in_1", schema_one, {1})

    schema_two = [
        defCol("a", "INTEGER", [2]),
        defCol("b", "INTEGER", [2]),
    ]
    # NOTE(review): "in2" lacks the underscore its siblings use; kept
    # verbatim because the relation name is a runtime string
    second = sal.create("in2", schema_two, {2})

    schema_three = [
        defCol("a", "INTEGER", [3]),
        defCol("b", "INTEGER", [3]),
    ]
    third = sal.create("in_3", schema_three, {3})

    # combine parties' inputs into one relation
    combined = sal.concat([first, second, third], "rel")

    projected = sal.project(combined, "proj", ["a", "b"])
    summed = sal.aggregate(projected, "agg", ["a"], "b", "+", "total_b")
    divided = sal.divide(summed, "div", "a", ["a", 1])
    multiplied = sal.multiply(divided, "mult", "a", ["a", 1])

    sal.collect(multiplied, 1)

    # root nodes of the workflow
    return {first, second, third}
def mult_by_const():
    """Multiply column ``a`` of the relation from ``setup`` by the constant 10.

    The result is written to column ``a`` of a relation named "res",
    which is then passed to ``sal._open`` for party 1.

    :return: the inputs object returned by ``setup()``
    """
    roots, combined = setup()
    scaled = sal.multiply(combined, "res", "a", ["a", 10])
    # the opened node itself is not needed here
    sal._open(scaled, "opened", 1)
    return roots
def protocol():
    """Multiply ``a`` by ``b`` on the first input from ``setup`` and collect it.

    Only element 0 of what ``setup()`` returns is used; the product is
    stored in column ``a`` of "mult" and collected by party 1.

    :return: single-element set holding the input relation that was used
    """
    source = setup()[0]
    product = sal.multiply(source, "mult", "a", ["a", "b"])
    # the collect result is not needed locally
    sal.collect(product, 1)
    return {source}
def protocol():
    """Build the "yellow" workflow where every input belongs to party 1.

    Three relations ("yellow1", "yellow2", "yellow3") with INTEGER
    columns ``companyID`` and ``price`` are all created with trust set
    [1] and owner set {1}, concatenated, and run through the
    aggregate / divide / multiply / join chain ending in a relation named
    "hhi", which is collected by party 1.

    :return: set holding the three created input relations
    """
    # all three schemas are built before any relation is created,
    # matching the original statement order
    schema_one = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]
    schema_two = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]
    schema_three = [
        defCol("companyID", "INTEGER", [1]),
        defCol("price", "INTEGER", [1]),
    ]

    source_one = sal.create("yellow1", schema_one, {1})
    source_two = sal.create("yellow2", schema_two, {1})
    source_three = sal.create("yellow3", schema_three, {1})

    combined = sal.concat([source_one, source_two, source_three], "cab_data")
    chosen = sal.project(combined, "selected_input", ["companyID", "price"])

    per_company = sal.aggregate(
        chosen, "local_rev", ["companyID"], "price", "+", "local_rev")
    downscaled = sal.divide(
        per_company, "scaled_down", "local_rev", ["local_rev", 1000])

    # companyID * 0: presumably collapses all rows onto one key so the
    # following aggregate is a global sum -- TODO confirm
    zero_keyed = sal.multiply(
        downscaled, "first_val_blank", "companyID", ["companyID", 0])
    upscaled = sal.multiply(
        zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    overall = sal.aggregate(
        zero_keyed, "total_rev", ["companyID"], "local_rev", "+", "global_rev")

    paired = sal.join(
        upscaled, overall, "local_total_rev", ["companyID"], ["companyID"])
    share = sal.divide(
        paired, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = sal.multiply(
        share, "market_share_squared", "local_rev",
        ["local_rev", "local_rev", 1])
    hhi_rel = sal.aggregate(
        share_squared, "hhi", ["companyID"], "local_rev", "+", "hhi")

    sal.collect(hhi_rel, 1)

    return {source_one, source_two, source_three}
def protocol():
    """Multiply ``a`` by ``b`` on the first input from ``setup`` and collect it.

    Only element 0 of what ``setup()`` returns is used; the product is
    stored in column ``a`` of "mult" and collected by party 1.

    :return: single-element set holding the input relation that was used
    """
    source = setup()[0]
    product = cc.multiply(source, "mult", "a", ["a", "b"])
    # the collect result is not needed locally
    cc.collect(product, 1)
    return {source}
def protocol():
    """Create one party-1 relation and multiply its ``a`` and ``b`` columns.

    The relation "in1" has INTEGER columns 'a' and 'b'. The multiply
    node 'mult1' writes the product into 'a'. No collect/open operator
    is attached in this function.

    :return: single-element set holding the created input relation
    """
    schema = [
        defCol('a', 'INTEGER', [1]),
        defCol('b', 'INTEGER', [1]),
    ]
    source = sal.create("in1", schema, {1})

    # the node is created for its effect on the workflow graph; the
    # handle is not used afterwards
    sal.multiply(source, 'mult1', 'a', ['a', 'b'])

    return {source}
def protocol():
    """Join a multiplied first input with a projected second input, then aggregate.

    The first relation from ``setup()`` has ``b * c`` written into ``a``;
    the second is projected onto ``a`` and ``b``. They are joined on
    (``a``, ``b``), ``c`` is summed per (``a``, ``b``) into ``agg_1``,
    and the result is collected by party 1.

    :return: set holding the two input relations taken from ``setup()``
    """
    sources = setup()
    left_source = sources[0]
    right_source = sources[1]

    left = sal.multiply(left_source, "mult", "a", ["b", "c"])
    right = sal.project(right_source, "proj_2", ["a", "b"])

    joined = sal.join(left, right, "join", ["a", "b"], ["a", "b"])
    summed = sal.aggregate(joined, "agg", ["a", "b"], "c", "sum", "agg_1")

    # the collect result is not needed locally
    sal.collect(summed, 1)

    return {left_source, right_source}
def protocol():
    """Divide / project one input, multiply the other, then join and aggregate.

    On the first relation from ``setup()``, ``a / b`` is written into
    ``a`` and the result projected onto ``a`` and ``b``. On the second,
    ``a * b`` is written into ``a``. Both sides are joined on
    (``a``, ``b``), ``c`` is summed per (``a``, ``b``) into ``agg_1``,
    and the result is collected by party 1.

    :return: set holding the two input relations taken from ``setup()``
    """
    sources = setup()
    left_source = sources[0]
    right_source = sources[1]

    # node creation order matches the original: divide, multiply, project
    quotient = sal.divide(left_source, "div", "a", ["a", "b"])
    right = sal.multiply(right_source, "mult", "a", ["a", "b"])
    left = sal.project(quotient, "proj", ["a", "b"])

    joined = sal.join(left, right, "join", ["a", "b"], ["a", "b"])
    summed = sal.aggregate(joined, "agg", ["a", "b"], "c", "sum", "agg_1")

    # the collect result is not needed locally
    sal.collect(summed, 1)

    return {left_source, right_source}
def protocol():
    """Divide / project one input, multiply the other, then join and aggregate.

    On the first relation from ``setup()``, ``a / b`` is written into
    ``a`` and the result projected onto ``a`` and ``b``. On the second,
    ``a * b`` is written into ``a``. Both sides are joined on
    (``a``, ``b``), ``c`` is summed per (``a``, ``b``) into ``agg_1``,
    and the result is collected by party 1.

    :return: set holding the two input relations taken from ``setup()``
    """
    sources = setup()
    left_source = sources[0]
    right_source = sources[1]

    # node creation order matches the original: divide, multiply, project
    quotient = cc.divide(left_source, "div", "a", ["a", "b"])
    right = cc.multiply(right_source, "mult", "a", ["a", "b"])
    left = cc.project(quotient, "proj", ["a", "b"])

    joined = cc.join(left, right, "join", ["a", "b"], ["a", "b"])
    summed = cc.aggregate(joined, "agg", ["a", "b"], "c", "sum", "agg_1")

    cc.collect(summed, 1)

    return {left_source, right_source}
def protocol():
    """Build the single-party workflow over the "movement" relation.

    The input is owned by party 1 and keyed throughout by
    (``store_code_uc``, ``upc``, ``week_end``). The workflow derives a
    unit price, the summed units ``q`` and a weighted unit price
    ``avg_unit_p``, projects the output columns and collects them for
    party 1.

    :return: single-element set holding the "movement" input relation
    """
    # (store, product, week) key; copied into a fresh list per use so
    # every operator receives its own list object, as in the original
    store_upc_week = ('store_code_uc', 'upc', 'week_end')

    schema = [
        defCol("store_code_uc", "STRING", [1]),
        defCol("upc", "STRING", [1]),
        defCol("week_end", "STRING", [1]),
        defCol("units", "INTEGER", [1]),
        defCol("prmult", "INTEGER", [1]),
        defCol("price", "FLOAT", [1]),
        defCol("retailer_code", "STRING", [1]),
        defCol("store_zip3", "STRING", [1]),
    ]
    movement = sal.create("movement", schema, {1})

    # unit_price = price / prmult
    with_unit_price = sal.divide(
        movement, "w_unit_p", 'unit_price', ['price', 'prmult'])

    # total units per (store, product, week), exposed as 'q'
    units_per_key = sal.aggregate(
        with_unit_price, 'sum_units', list(store_upc_week), 'units', '+', 'q')

    # attach 'q' to every row that carries 'unit_price'
    with_totals = sal.join(
        with_unit_price, units_per_key, 'total_units',
        list(store_upc_week), list(store_upc_week))

    # wghtd_unit_p = units * unit_price
    weighted = sal.multiply(
        with_totals, 'wghtd_total', 'wghtd_unit_p', ['units', 'unit_price'])

    # then divided by 'q', the total units sold
    weighted_final = sal.divide(
        weighted, 'wghtd_total_final', 'wghtd_unit_p', ['wghtd_unit_p', 'q'])

    weights_per_key = sal.aggregate(
        weighted_final, 'total_unit_wghts', list(store_upc_week),
        'wghtd_unit_p', '+', 'avg_unit_p')

    # merge in avg_unit_p
    merged = sal.join(
        with_totals, weights_per_key, 'final_join',
        list(store_upc_week), list(store_upc_week))

    output_cols = sal.project(merged, 'selected_cols', [
        'store_code_uc',
        'upc',
        'week_end',
        'q',
        'avg_unit_p',
        'retailer_code',
        'store_zip3',
    ])

    # the collect result is not needed locally
    sal.collect(output_cols, 1)

    return {movement}
def protocol():
    """Concatenate two parties' inputs and multiply column ``a`` by 1.

    Parties 1 and 2 each contribute a relation ("in_1", "in_2") with
    INTEGER columns ``a`` and ``b``. The multiply result is collected
    by party 1.

    :return: set holding the two created input relations
    """
    # define inputs
    schema_one = [
        defCol("a", "INTEGER", [1]),
        defCol("b", "INTEGER", [1]),
    ]
    first = cc.create("in_1", schema_one, {1})

    schema_two = [
        defCol("a", "INTEGER", [2]),
        defCol("b", "INTEGER", [2]),
    ]
    second = cc.create("in_2", schema_two, {2})

    # combine parties' inputs into one relation
    combined = cc.concat([first, second], "rel")

    # the workflow itself: a single multiplication by a constant
    product = cc.multiply(combined, "mult", "a", ["a", 1])
    cc.collect(product, 1)

    # root nodes of the workflow
    return {first, second}
def protocol():
    """
    Demo protocol: square one column, then sum it per group.

    Per the original notes, the input is read from
    data/input_relation.csv and the aggregation result ends up in
    data/aggregated.csv.

    :return set of input relations
    """
    # input schema: column name, type, and trust set for each column
    schema = [
        defCol("column_a", "INTEGER", [1]),
        defCol("column_b", "INTEGER", [1]),
    ]

    # input relation: relation name, columns, and owner set
    source = lang.create("input_relation", schema, {1})

    # (column_a, column_b) -> (column_a, column_b * column_b)
    squared_rel = lang.multiply(
        source, "squared", "column_b", ["column_b", "column_b"])

    # sum column_b grouped by column_a; the summed column is named "summed"
    lang.aggregate(
        squared_rel, "aggregated", ["column_a"], "column_b", "+", "summed")

    # leaf nodes are written to file automatically (original note), so
    # no explicit collect is issued here

    # all input relations
    return {source}
def protocol():
    """Concatenate two parties' inputs and multiply column ``a`` by 0.

    Parties 1 and 2 each contribute a relation ("in1", "in_2") with
    INTEGER columns ``a`` and ``b``. The multiply result is collected
    by party 1.

    :return: set holding the two created input relations
    """
    # define inputs
    schema_one = [
        defCol("a", "INTEGER", [1]),
        defCol("b", "INTEGER", [1]),
    ]
    # relation names "in1" / "in_2" are kept exactly as in the original
    first = sal.create("in1", schema_one, {1})

    schema_two = [
        defCol("a", "INTEGER", [2]),
        defCol("b", "INTEGER", [2]),
    ]
    second = sal.create("in_2", schema_two, {2})

    # combine parties' inputs into one relation
    combined = sal.concat([first, second], "rel")

    # the workflow itself: a single multiplication by the constant 0
    product = sal.multiply(combined, "mult", "a", ["a", 0])
    sal.collect(product, 1)

    # root nodes of the workflow
    return {first, second}
def protocol():
    """Build the two-input brand / retailer weighted-price workflow.

    Inputs are 'concatenated_DFs' and 'temp_UPC_brandBU_crspnd', joined
    on 'upc'. Section 1 works at the
    (store_code_uc, brand_code_bu, week_end) level, section 2 at the
    (store_zip3, retailer_code, brand_code_bu, week_end) level. The
    final projection is collected by party 1.

    :return: set holding the two created input relations
    """
    # Join / group keys. Each use passes a fresh list copy so every
    # operator receives its own list object, as in the original.
    store_brand_week = ('store_code_uc', 'brand_code_bu', 'week_end')
    zip_retailer_brand_week = (
        'store_zip3', 'retailer_code', 'brand_code_bu', 'week_end')

    store_level_schema = [
        defCol("store_code_uc", "STRING", [1]),
        defCol('upc', 'STRING', [1]),
        defCol('week_end', 'STRING', [1]),
        defCol('q', 'INTEGER', [1]),
        defCol('avg_unit_p', 'FLOAT', [1]),
        defCol('retailer_code', 'STRING', [1]),
        defCol('store_zip3', 'STRING', [1]),
    ]
    # NOTE(review): these columns carry trust set [2] while the relation
    # below is created with owner set {1} -- confirm this is intended
    brand_lookup_schema = [
        defCol('brand_code_bu', 'STRING', [2]),
        defCol('brand_descr_bu', 'STRING', [2]),
        defCol('upc', 'STRING', [2]),
        defCol('size1_amount', 'FLOAT', [2]),
    ]

    # concatenated DFs from local_workflow.py
    store_level = sal.create(
        'concatenated_DFs', store_level_schema, {1})
    # the output of preprocess_products.py
    brand_lookup = sal.create(
        'temp_UPC_brandBU_crspnd', brand_lookup_schema, {1})

    # SECTION 1
    # Compute the quantity weighted average price per unit & total
    # quantity sold at the store-brand level
    with_brand = sal.join(
        store_level, brand_lookup, 'w_upc', ['upc'], ['upc'])

    # avg_OZ_p = avg_unit_p / size1_amount
    with_oz_price = sal.divide(
        with_brand, 'w_avg_OZ_p', 'avg_OZ_p', ['avg_unit_p', 'size1_amount'])

    # q = q * size1_amount
    with_oz_qty = sal.multiply(
        with_oz_price, 'w_q_upd', 'q', ['q', 'size1_amount'])

    brand_qty = sal.aggregate(
        with_oz_qty, 'brand_OZq_sum', list(store_brand_week),
        'q', '+', 'brand_OZq')

    with_brand_qty = sal.join(
        with_oz_qty, brand_qty, 'total_brnd_OZq',
        list(store_brand_week), list(store_brand_week))

    # wghtd_OZ_brnd_p = q * avg_OZ_p, then divided by brand_OZq
    weighted_brand = sal.multiply(
        with_brand_qty, 'w_wghtd_OZ_brnd_p', 'wghtd_OZ_brnd_p',
        ['q', 'avg_OZ_p'])
    weighted_brand_final = sal.divide(
        weighted_brand, 'w_wghtd_OZ_brnd_p_final', 'wghtd_OZ_brnd_p',
        ['wghtd_OZ_brnd_p', 'brand_OZq'])

    brand_price = sal.aggregate(
        weighted_brand_final, 'brnd_p_sum', list(store_brand_week),
        'wghtd_OZ_brnd_p', '+', 'avg_OZ_brnd_p')

    section_one_joined = sal.join(
        brand_price, weighted_brand_final, 'result',
        list(store_brand_week), list(store_brand_week))

    section_one = sal.project(section_one_joined, 'section_one_result', [
        "avg_OZ_brnd_p",
        "week_end",
        "store_code_uc",
        "brand_code_bu",
        "brand_descr_bu",
        "brand_OZq",
        'retailer_code',
        'store_zip3',
        'q',
    ])

    # SECTION 2
    # Compute the average price per OZ & total OZs sold for each brand at
    # the retailer-$geo_unit level, by compiling the store level data that
    # comprises each retailer-$geo_unit. Compute the total quantity sold
    # by each retailer-$geo_unit
    geo_brand_qty = sal.aggregate(
        section_one, 'temp_sum', list(zip_retailer_brand_week),
        'brand_OZq', '+', 'brand_OZq')

    with_geo_qty = sal.join(
        section_one, geo_brand_qty, 'result_brnd_sum',
        list(zip_retailer_brand_week), list(zip_retailer_brand_week))

    # wghtd_p = brand_OZq * avg_OZ_brnd_p, then divided by q
    weighted_geo = sal.multiply(
        with_geo_qty, 'wghtd_p_mult', 'wghtd_p',
        ['brand_OZq', 'avg_OZ_brnd_p'])
    weighted_geo_final = sal.divide(
        weighted_geo, 'wghtd_p_final', 'wghtd_p', ['wghtd_p', 'q'])

    geo_price = sal.aggregate(
        weighted_geo_final, 'wghtd_p_sum', list(zip_retailer_brand_week),
        'wghtd_p', '+', 'p')

    section_two_joined = sal.join(
        weighted_geo_final, geo_price, 'sec_4_result',
        list(zip_retailer_brand_week), list(zip_retailer_brand_week))

    # TODO: filter out sec_4_result rows where 'store_zip3' cell is empty
    output_rel = sal.project(section_two_joined, 'final', [
        'store_zip3',
        'retailer_code',
        'week_end',
        'brand_code_bu',
        'brand_descr_bu',
        'q',
        'p',
    ])

    # the collect result is not needed locally
    sal.collect(output_rel, 1)

    return {store_level, brand_lookup}