def setup(conf: Dict):
    pid = conf["pid"]
    hdfs_node_name = conf["spark"]["hdfs"]["node_name"]
    hdfs_root = conf["spark"]["hdfs"]["root"]
    spark_master_url = conf["spark"]["master_url"]
    workflow_name = conf["workflow_name"]

    sm_config = SharemindCodeGenConfig(conf["code_path"])
    spark_config = SparkConfig(spark_master_url)

    conclave_config = CodeGenConfig(workflow_name) \
        .with_sharemind_config(sm_config) \
        .with_spark_config(spark_config)

    conclave_config.code_path = conf["code_path"] + workflow_name
    conclave_config.input_path = "hdfs://{}/{}/{}".format(
        hdfs_node_name, hdfs_root, conf["name"])
    conclave_config.output_path = "hdfs://{}/{}/{}".format(
        hdfs_node_name, hdfs_root, conf["name"])
    conclave_config.pid = pid
    conclave_config.name = workflow_name
    conclave_config.all_pids = [int(p) for p in conf["all_pids"]]

    network_config = NetworkConfig(conf["sharemind"]["parties"], pid)
    conclave_config.with_network_config(network_config)

    return conclave_config
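# A minimal sketch (not taken from the source) of a configuration dict that the
# setup() variant above could consume. The key names mirror the lookups in that
# function; the concrete hosts, ports, and paths are illustrative assumptions.
example_conf = {
    "pid": 1,
    "all_pids": [1, 2, 3],
    "workflow_name": "example-workflow",
    "name": "example-data",
    "code_path": "/mnt/shared/",
    "spark": {
        "master_url": "spark://spark-master:7077",
        "hdfs": {"node_name": "namenode:9000", "root": "user/conclave"}
    },
    "sharemind": {
        "parties": {
            1: {"host": "ca-spark-node-0", "port": 9001},
            2: {"host": "cb-spark-node-0", "port": 9002},
            3: {"host": "cc-spark-node-0", "port": 9003}
        }
    }
}
# conclave_config = setup(example_conf)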
def join(namenode, root, f_size, master_url):
    @dag_only
    def protocol():
        colsInA = [
            defCol('a', 'INTEGER', [1]),
            defCol('b', 'INTEGER', [1]),
        ]
        colsInB = [
            defCol('a', 'INTEGER', [1]),
            defCol('c', 'INTEGER', [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        in2 = sal.create("in2", colsInB, set([1]))
        join1 = sal.join(in1, in2, 'join1', ['a'], ['a'])
        return set([in1, in2])

    dag = protocol()
    config = CodeGenConfig('join_spark_{}'.format(f_size))
    config.code_path = "/mnt/shared/" + config.name
    config.input_path = "hdfs://{}/{}/{}".format(namenode, root, f_size)
    config.output_path = "hdfs://{}/{}/join_sp{}".format(namenode, root, f_size)

    cg = spark.SparkCodeGen(config, dag)
    job = cg.generate(config.name, config.output_path)
    job_queue = [job]
    dis.dispatch_all(master_url, None, job_queue)
def scalar_div(namenode, root, f_size, master_url):
    @dag_only
    def protocol():
        colsInA = [
            defCol('a', 'INTEGER', [1]),
            defCol('b', 'INTEGER', [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        div1 = sal.divide(in1, 'div1', 'a', ['a', 5])
        return set([in1])

    dag = protocol()
    config = CodeGenConfig('scalar_div_spark_{}'.format(f_size))
    config.code_path = "/mnt/shared/" + config.name
    config.input_path = "hdfs://{}/{}/{}".format(namenode, root, f_size)
    config.output_path = "hdfs://{}/{}/scalar_div_sp{}".format(namenode, root, f_size)

    cg = spark.SparkCodeGen(config, dag)
    job = cg.generate(config.name, config.output_path)
    job_queue = [job]
    dis.dispatch_all(master_url, None, job_queue)
def project(namenode, root, f_size, master_url):
    @dag_only
    def protocol():
        colsInA = [
            defCol('a', 'INTEGER', [1]),
            defCol('b', 'INTEGER', [1]),
            defCol('c', 'INTEGER', [1]),
            defCol('d', 'INTEGER', [1])
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        cols = [column.name for column in in1.out_rel.columns]
        shuffle(cols)
        proja = sal.project(in1, "proja", cols)
        return set([in1])

    dag = protocol()
    config = CodeGenConfig('project_spark_{}'.format(f_size))
    config.code_path = "/mnt/shared/" + config.name
    config.input_path = "hdfs://{}/{}/{}".format(namenode, root, f_size)
    config.output_path = "hdfs://{}/{}/project_sp{}".format(namenode, root, f_size)

    cg = spark.SparkCodeGen(config, dag)
    job = cg.generate(config.name, config.output_path)
    job_queue = [job]
    dis.dispatch_all(master_url, None, job_queue)
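# Hypothetical driver (an assumption, not part of the source) showing how the
# three Spark benchmark helpers above might be invoked. The argv layout mirrors
# the other scripts in this collection; the script name and the idea of running
# all three workloads back to back are illustrative only.
import sys

if __name__ == "__main__":
    if len(sys.argv) < 5:
        print("usage: spark_benchmarks.py <HDFS namenode:port> <HDFS root dir> "
              "<input size> <Spark master url>")
        sys.exit(1)
    namenode, root, f_size, master_url = sys.argv[1:5]
    join(namenode, root, f_size, master_url)
    scalar_div(namenode, root, f_size, master_url)
    project(namenode, root, f_size, master_url)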
def no_hdfs():
    pid = int(sys.argv[1])
    num_tuples = sys.argv[2]
    workflow_name = "sharemind_join_{}_{}".format(num_tuples, pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=True)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared/join/" + num_tuples
    codegen_config.output_path = "/mnt/shared/join/" + num_tuples

    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "ca-spark-node-0", "port": 9001},
            2: {"host": "cb-spark-node-0", "port": 9002},
            3: {"host": "cc-spark-node-0", "port": 9003}
        }
    }
    sm_peer = conclave.net.setup_peer(sharemind_config)
    join(pid, codegen_config, sm_peer, num_tuples)
def party_proc(pid):
    sharemind_home = "/home/sharemind/Sharemind-SDK/sharemind/client"
    spark_master = "local"
    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "localhost", "port": 9001},
            2: {"host": "localhost", "port": 9002},
            3: {"host": "localhost", "port": 9003}
        }
    }
    peer = conclave.net.setup_peer(sharemind_config)
    codegen_config = CodeGenConfig()
    job = SharemindCodeGen(codegen_config, join(), pid).generate(
        "job-" + str(pid), sharemind_home)
    job_queue = [job]
    conclave.dispatch.dispatch_all(spark_master, peer, job_queue)
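# Hypothetical local driver (an assumption, not from the source) that launches
# one party_proc per party in separate processes, matching the three localhost
# endpoints configured above.
from multiprocessing import Process

if __name__ == "__main__":
    procs = [Process(target=party_proc, args=(pid,)) for pid in [1, 2, 3]]
    for p in procs:
        p.start()
    for p in procs:
        p.join()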
def test_ssn(self):
    def protocol():
        govreg_cols = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1])
        ]
        govreg = cc.create("a_govreg", govreg_cols, {1})
        govreg_dummy = cc.project(govreg, "govreg_dummy", ["a", "b"])

        company0_cols = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        company0 = cc.create("company0", company0_cols, {2})
        company0_dummy = cc.project(company0, "company0_dummy", ["c", "d"])

        company1_cols = [
            defCol("c", "INTEGER", [1], [3]),
            defCol("d", "INTEGER", [3])
        ]
        company1 = cc.create("company1", company1_cols, {3})
        company1_dummy = cc.project(company1, "company1_dummy", ["c", "d"])

        companies = cc.concat([company0_dummy, company1_dummy], "companies")
        joined = cc.join(govreg_dummy, companies, "joined", ["a"], ["c"])
        res = cc.aggregate(joined, "actual", ["b"], "d", "sum", "total")
        cc.collect(res, 1)
        return {govreg, company0, company1}

    dag = rewrite_dag(ccdag.OpDag(protocol()), use_leaky_ops=True)
    actual = ScotchCodeGen(CodeGenConfig(), dag)._generate(0, 0)
    self.check_workflow(actual, "ssn_leaky")
def generate(dag, name):
    cfg = CodeGenConfig('cfg')
    cg = SparkCodeGen(cfg, dag)
    actual = cg._generate('code', '/tmp')[1]
    with open('/tmp/' + name + '.py', 'w') as out:
        out.write(actual)
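# Hypothetical usage (assumed, not from the source): dump the generated Spark
# code for a dag built with one of the protocol() helpers in this collection,
# so it can be inspected by hand.
#
#     generate(protocol(), 'join_debug')   # writes /tmp/join_debug.py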
def check_workflow(self, dag, name):
    expected_rootdir = \
        "{}/spark_expected".format(os.path.dirname(os.path.realpath(__file__)))
    cfg = CodeGenConfig('cfg')
    cg = SparkCodeGen(cfg, dag)
    actual = cg._generate('code', '/tmp')[1]
    with open(expected_rootdir + '/{}'.format(name), 'r') as f:
        expected = f.read()
    self.assertEqual(expected, actual)
def check_workflow(self, dag, name: str):
    expected_rootdir = \
        "{}/python_expected".format(os.path.dirname(os.path.realpath(__file__)))
    cfg = CodeGenConfig('cfg')
    cg = PythonCodeGen(cfg, dag)
    actual = cg._generate('code', '/tmp')[1]
    with open(expected_rootdir + '/{}'.format(name), 'r') as f_specific, open(
            expected_rootdir + '/{}'.format("base"), 'r') as f_base:
        expected_base = f_base.read()
        expected_specific = f_specific.read()
        expected = expected_base + expected_specific
    self.assertEqual(expected, actual)
def test_hybrid_agg_opt(self):
    def protocol():
        cols_in_1 = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1])
        ]
        in_1 = cc.create("in_1", cols_in_1, {1})

        cols_in_2 = [
            defCol("a", "INTEGER", [1], [2]),
            defCol("b", "INTEGER", [2])
        ]
        in_2 = cc.create("in_2", cols_in_2, {2})

        cc.collect(
            cc.aggregate(cc.concat([in_1, in_2], "rel"), "agg", ["a"], "b", "sum", "total_b"),
            1)
        return {in_1, in_2}

    dag = rewrite_dag(ccdag.OpDag(protocol()), use_leaky_ops=True)
    actual = ScotchCodeGen(CodeGenConfig(), dag)._generate(0, 0)
    self.check_workflow(actual, "hybrid_agg_leaky")
def test_public_join(self):
    def protocol():
        left_one_cols = [
            defCol("a", "INTEGER", 1, 2, 3),
            defCol("b", "INTEGER", 1)
        ]
        left_one = cc.create("left_one", left_one_cols, {1})

        right_one_cols = [
            defCol("c", "INTEGER", 1, 2, 3),
            defCol("d", "INTEGER", 1)
        ]
        right_one = cc.create("right_one", right_one_cols, {1})

        left_two_cols = [
            defCol("a", "INTEGER", 1, 2, 3),
            defCol("b", "INTEGER", 2)
        ]
        left_two = cc.create("left_two", left_two_cols, {2})

        right_two_cols = [
            defCol("c", "INTEGER", 1, 2, 3),
            defCol("d", "INTEGER", 2)
        ]
        right_two = cc.create("right_two", right_two_cols, {2})

        left = cc.concat([left_one, left_two], "left")
        right = cc.concat([right_one, right_two], "right")
        joined = cc.join(left, right, "joined", ["a"], ["c"])
        cc.collect(joined, 1)
        return {left_one, left_two, right_one, right_two}

    dag = rewrite_dag(ccdag.OpDag(protocol()))
    actual = ScotchCodeGen(CodeGenConfig(), dag)._generate(0, 0)
    self.check_workflow(actual, 'public_join')
def test_hybrid_join_party_two_opt(self):
    def protocol():
        # define inputs
        cols_in_1 = [
            defCol("a", "INTEGER", [1], [2]),
            defCol("b", "INTEGER", [1]),
        ]
        in_1 = cc.create("in_1", cols_in_1, {1})

        cols_in_2 = [
            defCol("c", "INTEGER", [2]),
            defCol("d", "INTEGER", [2])
        ]
        in_2 = cc.create("in_2", cols_in_2, {2})

        result = cc.join(in_1, in_2, "result", ["a"], ["c"])
        cc.collect(result, 1)
        # create dag
        return {in_1, in_2}

    dag = rewrite_dag(ccdag.OpDag(protocol()), use_leaky_ops=True)
    actual = ScotchCodeGen(CodeGenConfig(), dag)._generate(0, 0)
    self.check_workflow(actual, 'hybrid_join_leaky_party_two')
if len(sys.argv) < 5:
    print(
        "usage: taxi.py <party ID> <HDFS master node:port> <HDFS root dir> <Spark master url>"
    )
    sys.exit(1)

pid = int(sys.argv[1])
hdfs_namenode = sys.argv[2]
hdfs_root = sys.argv[3]
spark_master_url = sys.argv[4]

workflow_name = "job-" + str(pid)
sm_config = SharemindCodeGenConfig("/mnt/shared")
spark_config = SparkConfig(spark_master_url)
conclave_config = CodeGenConfig(workflow_name) \
    .with_sharemind_config(sm_config) \
    .with_spark_config(spark_config)
conclave_config.code_path = "/mnt/shared/" + workflow_name
conclave_config.input_path = "hdfs://{}/{}/taxi".format(hdfs_namenode, hdfs_root)
conclave_config.output_path = "hdfs://{}/{}/taxi".format(hdfs_namenode, hdfs_root)
conclave_config.pid = pid
conclave_config.name = workflow_name

network_config = {
    "pid": pid,
    "parties": {
        1: {"host": "ca-spark-node-0", "port": 9001},
def testHybridJoinWorkflow():
    def protocol():
        # define inputs
        colsIn1 = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        in1 = sal.create("in1", colsIn1, set([1]))
        proj1 = sal.project(in1, "proj1", ["a", "b"])

        colsIn2 = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        in2 = sal.create("in2", colsIn2, set([2]))
        proj2 = sal.project(in2, "proj2", ["c", "d"])

        res = sal.join(proj1, proj2, "res", ["a"], ["c"])
        # open result to party 1
        sal.collect(res, 1)
        # return roots of dag
        return set([in1, in2])

    pid = int(sys.argv[1])
    size = sys.argv[2]
    workflow_name = "hybrid-join-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=True)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.pid = pid
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared/hybridjoin/" + size
    codegen_config.output_path = "/mnt/shared/hybridjoin/" + size

    jobqueue = generate_code(protocol, codegen_config, ["sharemind"], ["python"])

    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "localhost", "port": 9001},
            2: {"host": "localhost", "port": 9002},
            3: {"host": "localhost", "port": 9003}
        }
    }
    sm_peer = setup_peer(sharemind_config)
    conclave.dispatch.dispatch_all(None, sm_peer, jobqueue)
def testPublicJoinWorkflow():
    @dag_only
    def protocol():
        # define inputs
        colsInA = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        in1.isMPC = False
        proja = sal.project(in1, "proja", ["a", "b"])
        proja.isMPC = False
        proja.out_rel.storedWith = set([1])

        colsInB = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        in2 = sal.create("in2", colsInB, set([2]))
        in2.isMPC = False
        projb = sal.project(in2, "projb", ["c", "d"])
        projb.isMPC = False
        projb.out_rel.storedWith = set([2])

        clA = sal._close(proja, "clA", set([1, 2, 3]))
        clA.isMPC = True
        clB = sal._close(projb, "clB", set([1, 2, 3]))
        clB.isMPC = True

        persistedA = sal._persist(clA, "persistedA")
        persistedA.isMPC = True
        persistedB = sal._persist(clB, "persistedB")
        persistedB.isMPC = True

        keysaclosed = sal.project(clA, "keysaclosed", ["a"])
        keysaclosed.out_rel.storedWith = set([1, 2, 3])
        keysaclosed.isMPC = True
        keysbclosed = sal.project(clB, "keysbclosed", ["c"])
        keysbclosed.isMPC = True
        keysbclosed.out_rel.storedWith = set([1, 2, 3])

        keysa = sal._open(keysaclosed, "keysa", 1)
        keysa.isMPC = True
        keysb = sal._open(keysbclosed, "keysb", 1)
        keysb.isMPC = True

        indexedA = sal.index(keysa, "indexedA", "indexA")
        indexedA.isMPC = False
        indexedA.out_rel.storedWith = set([1])
        indexedB = sal.index(keysb, "indexedB", "indexB")
        indexedB.isMPC = False
        indexedB.out_rel.storedWith = set([1])

        joinedindeces = sal.join(indexedA, indexedB, "joinedindeces", ["a"], ["c"])
        joinedindeces.isMPC = False
        joinedindeces.out_rel.storedWith = set([1])

        indecesonly = sal.project(joinedindeces, "indecesonly", ["indexA", "indexB"])
        indecesonly.isMPC = False
        indecesonly.out_rel.storedWith = set([1])

        indecesclosed = sal._close(indecesonly, "indecesclosed", set([1, 2, 3]))
        indecesclosed.isMPC = True

        joined = sal._index_join(persistedA, persistedB, "joined", ["a"], ["c"], indecesclosed)
        joined.out_rel.storedWith = set([1, 2, 3])
        joined.isMPC = True
        sal._open(joined, "opened", 1)
        # create condag
        return set([in1, in2])

    pid = int(sys.argv[1])
    size = sys.argv[2]
    workflow_name = "public-join-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=True)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared/hybridjoin/" + size
    codegen_config.output_path = "/mnt/shared/hybridjoin/" + size

    dag = protocol()
    mapping = part.heupart(dag, ["sharemind"], ["python"])
    job_queue = []
    for idx, (fmwk, subdag, storedWith) in enumerate(mapping):
        if fmwk == "sharemind":
            job = SharemindCodeGen(codegen_config, subdag, pid).generate(
                "sharemind-" + str(idx), None)
        else:
            job = PythonCodeGen(codegen_config, subdag).generate(
                "python-" + str(idx), None)
        # TODO: this probably doesn't belong here
        if pid not in storedWith:
            job.skip = True
        job_queue.append(job)

    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "ca-spark-node-0", "port": 9001},
            2: {"host": "cb-spark-node-0", "port": 9002},
            3: {"host": "cc-spark-node-0", "port": 9003}
        }
    }
    sm_peer = setup_peer(sharemind_config)
    dispatch_all(None, sm_peer, job_queue)
    agg = sal.aggregate(joined, "agg", ["b"], "d", "+", "total")
    opened = sal._open(agg, "opened", 1)
    return set([in1, in2, in3])


if __name__ == "__main__":
    pid = int(sys.argv[1])
    workflow_name = "sharemind-ssn-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=True)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared/ssn-data"
    codegen_config.output_path = "/mnt/shared/ssn-data"

    job = SharemindCodeGen(codegen_config, protocol(), pid).generate("sharemind-0", "")
    job_queue = [job]

    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "ca-spark-node-0", "port": 9001},
def testHybridAggWorkflow():
    @dag_only
    def protocol():
        # define inputs
        colsInA = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        in1.isMPC = False
        proja = sal.project(in1, "proja", ["a", "b"])
        proja.isMPC = False
        proja.out_rel.storedWith = set([1])

        # define inputs
        colsInB = [
            defCol("a", "INTEGER", [2]),
            defCol("b", "INTEGER", [2]),
        ]
        in2 = sal.create("in2", colsInB, set([2]))
        in2.isMPC = False
        projb = sal.project(in2, "projb", ["a", "b"])
        projb.isMPC = False
        projb.out_rel.storedWith = set([2])

        # define inputs
        colsInC = [
            defCol("a", "INTEGER", [3]),
            defCol("b", "INTEGER", [3]),
        ]
        in3 = sal.create("in3", colsInC, set([3]))
        in3.isMPC = False
        projc = sal.project(in3, "projc", ["a", "b"])
        projc.isMPC = False
        projc.out_rel.storedWith = set([3])

        clA = sal._close(proja, "clA", set([1, 2, 3]))
        clA.isMPC = True
        clB = sal._close(projb, "clB", set([1, 2, 3]))
        clB.isMPC = True
        clC = sal._close(projc, "clC", set([1, 2, 3]))
        clC.isMPC = True

        comb = sal.concat([clA, clB, clC], "comb")
        comb.out_rel.storedWith = set([1, 2, 3])
        comb.isMPC = True

        shuffled = sal.shuffle(comb, "shuffled")
        shuffled.out_rel.storedWith = set([1, 2, 3])
        shuffled.isMPC = True

        persisted = sal._persist(shuffled, "persisted")
        persisted.out_rel.storedWith = set([1, 2, 3])
        persisted.isMPC = True

        keysclosed = sal.project(shuffled, "keysclosed", ["a"])
        keysclosed.out_rel.storedWith = set([1, 2, 3])
        keysclosed.isMPC = True

        keys = sal._open(keysclosed, "keys", 1)
        keys.isMPC = True

        indexed = sal.index(keys, "indexed", "rowIndex")
        indexed.isMPC = False
        indexed.out_rel.storedWith = set([1])

        sortedByKey = sal.sort_by(indexed, "sortedByKey", "a")
        sortedByKey.isMPC = False
        sortedByKey.out_rel.storedWith = set([1])

        eqFlags = sal._comp_neighs(sortedByKey, "eqFlags", "a")
        eqFlags.isMPC = False
        eqFlags.out_rel.storedWith = set([1])

        # TODO: hack to get keys stored
        # need to fix later!
        sortedByKey = sal.project(sortedByKey, "sortedByKey", ["rowIndex", "a"])
        sortedByKey.isMPC = False
        sortedByKey.out_rel.storedWith = set([1])

        closedEqFlags = sal._close(eqFlags, "closedEqFlags", set([1, 2, 3]))
        closedEqFlags.isMPC = True
        closedSortedByKey = sal._close(sortedByKey, "closedSortedByKey", set([1, 2, 3]))
        closedSortedByKey.isMPC = True

        agg = sal.index_aggregate(persisted, "agg", ["a"], "b", "+", "b",
                                  closedEqFlags, closedSortedByKey)
        agg.out_rel.storedWith = set([1, 2, 3])
        agg.isMPC = True
        sal._open(agg, "opened", 1)
        # create condag
        return set([in1, in2, in3])

    pid = int(sys.argv[1])
    size = sys.argv[2]
    workflow_name = "hybrid-agg-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=True)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared/" + size
    codegen_config.output_path = "/mnt/shared/" + size

    dag = protocol()
    mapping = part.heupart(dag, ["sharemind"], ["python"])
    job_queue = []
    for idx, (fmwk, subdag, storedWith) in enumerate(mapping):
        if fmwk == "sharemind":
            job = SharemindCodeGen(codegen_config, subdag, pid).generate(
                "sharemind-" + str(idx), None)
        else:
            job = PythonCodeGen(codegen_config, subdag).generate(
                "python-" + str(idx), None)
        # TODO: this probably doesn't belong here
        if pid not in storedWith:
            job.skip = True
        job_queue.append(job)

    sharemind_config = {
        "pid": pid,
        "parties": {
            1: {"host": "ca-spark-node-0", "port": 9001},
            2: {"host": "cb-spark-node-0", "port": 9002},
            3: {"host": "cc-spark-node-0", "port": 9003}
        }
    }
    sm_peer = setup_peer(sharemind_config)
    dispatch_all(None, sm_peer, job_queue)
                              'wghtd_unit_p', '+', 'avg_unit_p')

    # merge in avg_unit_p
    final_join = sal.join(total_units, total_unit_wghts, 'final_join',
                          ['store_code_uc', 'upc', 'week_end'],
                          ['store_code_uc', 'upc', 'week_end'])
    selected_cols = sal.project(final_join, 'selected_cols', [
        'store_code_uc', 'upc', 'week_end', 'q', 'avg_unit_p',
        'retailer_code', 'store_zip3'
    ])
    opened = sal.collect(selected_cols, 1)
    return set([create])


if __name__ == "__main__":
    dag = protocol()
    config = CodeGenConfig("nielsen-local")

    vg = viz.VizCodeGen(config, dag)
    vg.generate("local_workflow", "/tmp")

    cg = spark.SparkCodeGen(config, dag)
    cg.generate("local_workflow", "/tmp")

    print("Spark code generated in {}".format(config.code_path))
        defCol("b", "INTEGER", [1]),
        defCol("c", "INTEGER", [1]),
        defCol("d", "INTEGER", [1])
    ]
    in1 = sal.create("in1", colsIn1, set([1]))
    in2 = sal.create("in2", colsIn1, set([1]))
    return [in1, in2]


@dag_only
def agg():
    in1 = setup()[0]
    agg = sal.aggregate(in1, "agg", ["a", "b"], "c", "sum", "agg1")
    out = sal.collect(agg, 1)
    return set([in1])


if __name__ == "__main__":
    dag_agg = agg()
    cfg_agg = CodeGenConfig('agg')
    cg_agg = SparkCodeGen(cfg_agg, dag_agg)
    cg_agg.generate('agg', '/tmp')
def main():
    pid = sys.argv[1]
    data_root = sys.argv[2]
    backend = sys.argv[3]
    workflow_name = "hhi-benchmark-" + pid

    if backend == "python":
        sharemind_conf = SharemindCodeGenConfig("/mnt/shared", use_docker=True, use_hdfs=False)
        conclave_config = CodeGenConfig(workflow_name, int(pid))
        conclave_config.with_sharemind_config(sharemind_conf)
        conclave_config.code_path = os.path.join("/mnt/shared", workflow_name)
        conclave_config.input_path = os.path.join("/mnt/shared", data_root)
        conclave_config.output_path = os.path.join("/mnt/shared", data_root)
        generate_and_dispatch(protocol, conclave_config, ["sharemind"], ["python"],
                              apply_optimizations=True)
    elif backend == "spark":
        sharemind_conf = SharemindCodeGenConfig("/mnt/shared", use_docker=True, use_hdfs=True)
        conclave_config = CodeGenConfig(workflow_name, int(pid))
        host = conclave_config.network_config["parties"][int(pid)]["host"]
        # Update this if your spark master and HDFS namenode are mapped to a
        # different host than your Conclave node
        spark_master_url = "spark://{}:7077".format(host)
        hdfs_namenode = "{}:9000".format(host)
        spark_config = SparkConfig(spark_master_url)
        conclave_config \
            .with_sharemind_config(sharemind_conf) \
            .with_spark_config(spark_config)
        conclave_config.code_path = os.path.join("/mnt/shared", workflow_name)
        conclave_config.input_path = "hdfs://{}/{}".format(hdfs_namenode, data_root)
        conclave_config.output_path = "hdfs://{}/{}".format(hdfs_namenode, data_root)
        generate_and_dispatch(protocol, conclave_config, ["sharemind"], ["spark"],
                              apply_optimizations=True)
    else:
        raise Exception("Unknown backend {}".format(backend))
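# Hypothetical invocation of the benchmark driver above, run once per party.
# The script name and data directory are assumptions; the argument order
# (party ID, data root, backend) follows the argv parsing in main().
#
#     python hhi_benchmark.py 1 hhi-data python   # local Python backend
#     python hhi_benchmark.py 1 hhi-data spark    # Spark + HDFS backend
if __name__ == "__main__":
    main()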
def testPublicJoinWorkflow():
    @dag_only
    def protocol():
        # define inputs
        colsInA = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        in1.isMPC = False
        proja = sal.project(in1, "proja", ["a", "b"])
        proja.isMPC = False
        proja.out_rel.storedWith = set([1])

        colsInB = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        in2 = sal.create("in2", colsInB, set([2]))
        in2.isMPC = False
        projb = sal.project(in2, "projb", ["c", "d"])
        projb.isMPC = False
        projb.out_rel.storedWith = set([2])

        clA = sal._close(proja, "clA", set([1, 2, 3]))
        clA.isMPC = True
        clB = sal._close(projb, "clB", set([1, 2, 3]))
        clB.isMPC = True

        persistedA = sal._persist(clA, "persistedA")
        persistedB = sal._persist(clB, "persistedB")

        keysaclosed = sal.project(clA, "keysaclosed", ["a"])
        keysaclosed.out_rel.storedWith = set([1, 2, 3])
        keysaclosed.isMPC = True
        keysbclosed = sal.project(clB, "keysbclosed", ["c"])
        keysbclosed.isMPC = True
        keysbclosed.out_rel.storedWith = set([1, 2, 3])

        keysa = sal._open(keysaclosed, "keysa", 1)
        keysa.isMPC = True
        keysb = sal._open(keysbclosed, "keysb", 1)
        keysb.isMPC = True

        indexedA = sal.index(keysa, "indexedA", "indexA")
        indexedA.isMPC = False
        indexedA.out_rel.storedWith = set([1])
        indexedB = sal.index(keysb, "indexedB", "indexB")
        indexedB.isMPC = False
        indexedB.out_rel.storedWith = set([1])

        joinedindeces = sal.join(indexedA, indexedB, "joinedindeces", ["a"], ["c"])
        joinedindeces.isMPC = False
        joinedindeces.out_rel.storedWith = set([1])

        indecesonly = sal.project(joinedindeces, "indecesonly", ["indexA", "indexB"])
        indecesonly.isMPC = False
        indecesonly.out_rel.storedWith = set([1])

        indecesclosed = sal._close(indecesonly, "indecesclosed", set([1, 2, 3]))
        indecesclosed.isMPC = True

        joined = sal._index_join(persistedA, persistedB, "joined", ["a"], ["c"], indecesclosed)
        joined.isMPC = True
        sal._open(joined, "opened", 1)
        # create condag
        return set([in1, in2])

    pid = int(sys.argv[1])
    workflow_name = "hybrid-join-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(workflow_name, "/mnt/shared", use_hdfs=False)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared"
    codegen_config.output_path = "/mnt/shared"

    exampleutils.generate_data(pid, codegen_config.output_path)

    dag = protocol()
    mapping = part.heupart(dag, ["sharemind"], ["python"])
    job_queue = []
    for idx, (fmwk, subdag, storedWith) in enumerate(mapping):
        if fmwk == "sharemind":
            job = SharemindCodeGen(codegen_config, subdag, pid).generate(
                "sharemind-" + str(idx), None)
        else:
            job = PythonCodeGen(codegen_config, subdag).generate(
                "python-" + str(idx), None)
        # TODO: this probably doesn't belong here
        if pid not in storedWith:
            job.skip = True
        job_queue.append(job)

    sharemind_config = exampleutils.get_sharemind_config(pid, True)
    sm_peer = setup_peer(sharemind_config)
    dispatch_all(None, sm_peer, job_queue)

    if pid == 1:
        expected = ['', '2,200,2001', '3,300,3001', '4,400,4001', '42,42,1001',
                    '5,500,5001', '6,600,6001', '7,700,7001', '7,800,7001',
                    '7,900,7001', '8,1000,8001', '9,1100,9001']
        exampleutils.check_res(expected, "/mnt/shared/opened.csv")
        print("Success")
def wrap():
    code = sharemind.SharemindCodeGen(CodeGenConfig(), f())._generate(None, None)
    return code
def wrap():
    code = scotch.ScotchCodeGen(CodeGenConfig(), f())._generate(None, None)
    return code
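# The two wrap() helpers above reference f() from an enclosing scope, so they
# are presumably the inner functions of test decorators. A minimal sketch of
# such a decorator, under that assumption (the decorator name is hypothetical):
def scotch_output(f):
    def wrap():
        # run the decorated protocol function and pretty-print its dag
        code = scotch.ScotchCodeGen(CodeGenConfig(), f())._generate(None, None)
        return code
    return wrap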
def testHybridJoinWorkflow():
    def hybrid_join():
        # define inputs
        colsInA = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        in1 = sal.create("in1", colsInA, set([1]))
        in1.isMPC = False
        proja = sal.project(in1, "proja", ["a", "b"])
        proja.isMPC = False
        proja.out_rel.storedWith = set([1])

        colsInB = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        in2 = sal.create("in2", colsInB, set([2]))
        in2.isMPC = False
        projb = sal.project(in2, "projb", ["c", "d"])
        projb.isMPC = False
        projb.out_rel.storedWith = set([2])

        clA = sal._close(proja, "clA", set([1, 2, 3]))
        clA.isMPC = True
        clB = sal._close(projb, "clB", set([1, 2, 3]))
        clB.isMPC = True

        shuffledA = sal.shuffle(clA, "shuffledA")
        shuffledA.isMPC = True
        persistedA = sal._persist(shuffledA, "persistedA")
        persistedA.isMPC = True
        shuffledB = sal.shuffle(clB, "shuffledB")
        shuffledB.isMPC = True
        persistedB = sal._persist(shuffledB, "persistedB")
        persistedB.isMPC = True

        keysaclosed = sal.project(shuffledA, "keysaclosed", ["a"])
        keysaclosed.out_rel.storedWith = set([1, 2, 3])
        keysaclosed.isMPC = True
        keysbclosed = sal.project(shuffledB, "keysbclosed", ["c"])
        keysbclosed.isMPC = True
        keysbclosed.out_rel.storedWith = set([1, 2, 3])

        keysa = sal._open(keysaclosed, "keysa", 1)
        keysa.isMPC = True
        keysb = sal._open(keysbclosed, "keysb", 1)
        keysb.isMPC = True

        indexedA = sal.index(keysa, "indexedA", "indexA")
        indexedA.isMPC = False
        indexedA.out_rel.storedWith = set([1])
        indexedB = sal.index(keysb, "indexedB", "indexB")
        indexedB.isMPC = False
        indexedB.out_rel.storedWith = set([1])

        joinedindeces = sal.join(indexedA, indexedB, "joinedindeces", ["a"], ["c"])
        joinedindeces.isMPC = False
        joinedindeces.out_rel.storedWith = set([1])

        indecesonly = sal.project(joinedindeces, "indecesonly", ["indexA", "indexB"])
        indecesonly.isMPC = False
        indecesonly.out_rel.storedWith = set([1])

        indecesclosed = sal._close(indecesonly, "indecesclosed", set([1, 2, 3]))
        indecesclosed.isMPC = True

        joined = sal._index_join(persistedA, persistedB, "joined", ["a"], ["c"], indecesclosed)
        joined.isMPC = True
        return joined, set([in1, in2])

    def hybrid_agg(in1):
        shuffled = sal.shuffle(in1, "shuffled")
        shuffled.out_rel.storedWith = set([1, 2, 3])
        shuffled.isMPC = True

        persisted = sal._persist(shuffled, "persisted")
        persisted.out_rel.storedWith = set([1, 2, 3])
        persisted.isMPC = True

        keysclosed = sal.project(shuffled, "keysclosed", ["b"])
        keysclosed.out_rel.storedWith = set([1, 2, 3])
        keysclosed.isMPC = True

        keys = sal._open(keysclosed, "keys", 1)
        keys.isMPC = True

        indexed = sal.index(keys, "indexed", "rowIndex")
        indexed.isMPC = False
        indexed.out_rel.storedWith = set([1])

        distinctKeys = sal.distinct(keys, "distinctKeys", ["b"])
        distinctKeys.isMPC = False
        distinctKeys.out_rel.storedWith = set([1])

        # TODO: hack to get keys stored
        # need to fix later!
        fakeDistinctKeys = sal.distinct(keys, "distinctKeys", ["b"])
        fakeDistinctKeys.isMPC = False
        fakeDistinctKeys.out_rel.storedWith = set([1])

        indexedDistinct = sal.index(distinctKeys, "indexedDistinct", "keyIndex")
        indexedDistinct.isMPC = False
        indexedDistinct.out_rel.storedWith = set([1])

        joinedindeces = sal.join(indexed, indexedDistinct, "joinedindeces", ["b"], ["b"])
        joinedindeces.isMPC = False
        joinedindeces.out_rel.storedWith = set([1])

        # TODO: could project row indeces away too
        indecesonly = sal.project(joinedindeces, "indecesonly", ["rowIndex", "keyIndex"])
        indecesonly.isMPC = False
        indecesonly.out_rel.storedWith = set([1])

        closedDistinct = sal._close(distinctKeys, "closedDistinct", set([1, 2, 3]))
        closedDistinct.isMPC = True
        closedLookup = sal._close(indecesonly, "closedLookup", set([1, 2, 3]))
        closedLookup.isMPC = True

        agg = sal.index_aggregate(persisted, "agg", ["b"], "d", "+", "d",
                                  closedLookup, closedDistinct)
        agg.isMPC = True
        sal._open(agg, "aggopened", 1)

    def protocol():
        joinedres, inputs = hybrid_join()
        hybrid_agg(joinedres)
        return saldag.OpDag(inputs)

    pid = int(sys.argv[1])
    workflow_name = "ssn-" + str(pid)

    sm_cg_config = SharemindCodeGenConfig(
        workflow_name, "/mnt/shared", use_hdfs=False, use_docker=False)
    codegen_config = CodeGenConfig(workflow_name).with_sharemind_config(sm_cg_config)
    codegen_config.code_path = "/mnt/shared/" + workflow_name
    codegen_config.input_path = "/mnt/shared"
    codegen_config.output_path = "/mnt/shared"

    exampleutils.generate_ssn_data(pid, codegen_config.output_path)

    dag = protocol()
    mapping = part.heupart(dag, ["sharemind"], ["python"])
    job_queue = []
    for idx, (fmwk, subdag, storedWith) in enumerate(mapping):
        if fmwk == "sharemind":
            job = SharemindCodeGen(codegen_config, subdag, pid).generate(
                "sharemind-" + str(idx), None)
        else:
            job = PythonCodeGen(codegen_config, subdag).generate(
                "python-" + str(idx), None)
        # TODO: this probably doesn't belong here
        if pid not in storedWith:
            job.skip = True
        job_queue.append(job)

    sharemind_config = exampleutils.get_sharemind_config(pid, True)
    sm_peer = setup_peer(sharemind_config)
    dispatch_all(None, sm_peer, job_queue)

    if pid == 1:
        expected = ['', '1,30', '2,50', '3,30']
        exampleutils.check_res(expected, "/mnt/shared/aggopened.csv")
        print("Success")
def setup(conf: dict):
    # GENERAL
    pid = int(conf["user_config"]["pid"])
    workflow_name = conf["user_config"]["workflow_name"]
    all_pids = conf["user_config"]['all_pids']
    use_leaky = conf["user_config"]["leaky_ops"]
    use_floats = conf["user_config"]["use_floats"]

    conclave_config = CodeGenConfig(workflow_name)

    # SPARK
    try:
        spark_avail = conf["backends"]["spark"]["available"]
        if spark_avail:
            spark_master_url = conf["backends"]["spark"]["master_url"]
            spark_config = SparkConfig(spark_master_url)
            conclave_config.with_spark_config(spark_config)
    except KeyError:
        pass

    # OBLIV-C
    try:
        oc_avail = conf["backends"]["oblivc"]["available"]
        if oc_avail:
            oc_path = conf["backends"]["oblivc"]["oc_path"]
            ip_port = conf["backends"]["oblivc"]["ip_port"]
            oc_config = OblivcConfig(oc_path, ip_port)
            conclave_config.with_oc_config(oc_config)
    except KeyError:
        pass

    # JIFF
    try:
        jiff_avail = conf["backends"]["jiff"]["available"]
        if jiff_avail:
            jiff_path = conf["backends"]["jiff"]["jiff_path"]
            party_count = len(all_pids)
            server_ip = conf["backends"]["jiff"]["server_ip"]
            server_pid = conf["backends"]["jiff"]["server_pid"]
            server_port = conf["backends"]["jiff"]["server_port"]
            jiff_config = JiffConfig(jiff_path, party_count, server_ip, server_port, server_pid)
            conclave_config.with_jiff_config(jiff_config)
    except KeyError:
        pass

    # NET
    hosts = conf["net"]["parties"]
    net_config = NetworkConfig(hosts, pid)
    conclave_config.with_network_config(net_config)

    conclave_config.pid = pid
    conclave_config.all_pids = all_pids
    conclave_config.name = workflow_name
    conclave_config.use_leaky_ops = use_leaky
    conclave_config.use_floats = use_floats
    conclave_config.input_path = conf["user_config"]["paths"]["input_path"]
    conclave_config.code_path = conf["user_config"]["paths"]["input_path"]
    conclave_config.output_path = conf["user_config"]["paths"]["input_path"]

    return conclave_config
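# A minimal sketch (assumed, not from the source) of a config dict accepted by
# the setup() variant above. Every key mirrors a lookup in that function; the
# hosts, ports, paths, and the exact shape of "net"/"parties" are assumptions.
sample_conf = {
    "user_config": {
        "pid": 1,
        "all_pids": [1, 2, 3],
        "workflow_name": "example-workflow",
        "leaky_ops": False,
        "use_floats": False,
        "paths": {"input_path": "/mnt/shared/example-data"}
    },
    "backends": {
        "spark": {"available": True, "master_url": "spark://spark-master:7077"},
        "oblivc": {"available": False},
        "jiff": {"available": False}
    },
    "net": {
        "parties": [
            {"host": "ca-spark-node-0", "port": 9001},
            {"host": "cb-spark-node-0", "port": 9002},
            {"host": "cc-spark-node-0", "port": 9003}
        ]
    }
}
# conclave_config = setup(sample_conf)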
    sec_4_result = sal.join(
        wghtd_p_final, wghtd_p_sum, 'sec_4_result',
        ['store_zip3', 'retailer_code', 'brand_code_bu', 'week_end'],
        ['store_zip3', 'retailer_code', 'brand_code_bu', 'week_end'])

    # TODO: filter out sec_4_result rows where 'store_zip3' cell is empty
    final = sal.project(sec_4_result, 'final', [
        'store_zip3', 'retailer_code', 'week_end', 'brand_code_bu',
        'brand_descr_bu', 'q', 'p'
    ])
    opened = sal.collect(final, 1)
    return set([concatenated_DFs, temp_UPC_brandBU_crspnd])


if __name__ == "__main__":
    dag = protocol()
    config = CodeGenConfig("nielsen-main")

    # use a distinct local name so the viz module itself isn't shadowed
    vg = viz.VizCodeGen(config, dag)
    vg.generate("main_workflow", "/tmp")

    cg = spark.SparkCodeGen(config, dag)
    cg.generate("main_workflow", "/tmp")

    print("Spark code generated in {}".format(config.code_path))