Beispiel #1
0
def protocol():
    """Assemble a two-input join query and return its root nodes.

    Relation "left" (columns a, b) is created with the set {1} and
    relation "right" (columns c, d) with the set {2}. Each input is fed
    through a projection that lists both of its columns, the two
    projections are joined on a == c, and the join output is passed to
    ``cc.collect`` with 1 as its second argument.

    :return: set containing the two ``cc.create`` nodes
    """
    lhs_schema = [defCol(col_name, "INTEGER", [1]) for col_name in ("a", "b")]
    lhs = cc.create("left", lhs_schema, {1})
    lhs_passthrough = cc.project(lhs, "zzz_left_dummy", ["a", "b"])

    # column "c" is declared with two trust-set arguments, [1] and [2]
    rhs_key = defCol("c", "INTEGER", [1], [2])
    rhs_val = defCol("d", "INTEGER", [2])
    rhs = cc.create("right", [rhs_key, rhs_val], {2})
    rhs_passthrough = cc.project(rhs, "right_dummy", ["c", "d"])

    joined = cc.join(lhs_passthrough, rhs_passthrough, "actual", ["a"], ["c"])
    cc.collect(joined, 1)

    # the create nodes are what the caller receives
    roots = {lhs, rhs}
    return roots
Beispiel #2
0
def protocol():
    """Build a three-input join-and-aggregate query with explicit close/open.

    Inputs "govreg" (a, b), "company0" (c, d) and "company1" (c, d) are
    created with the sets {1}, {2} and {3}. Every input is closed to
    {1, 2, 3}; the two company relations are concatenated; the projected
    govreg relation is joined with the concatenation on a == c; column d
    is aggregated with "+" grouped by b into "total"; and the aggregate is
    opened with 1 as the last argument.

    :return: set containing the three ``sal.create`` nodes
    """
    # input relations, one per party
    govreg_schema = [defCol("a", "INTEGER", [1]), defCol("b", "INTEGER", [1])]
    govreg = sal.create("govreg", govreg_schema, {1})
    company0_schema = [defCol("c", "INTEGER", [2]), defCol("d", "INTEGER", [2])]
    company0 = sal.create("company0", company0_schema, {2})
    company1_schema = [defCol("c", "INTEGER", [3]), defCol("d", "INTEGER", [3])]
    company1 = sal.create("company1", company1_schema, {3})

    govreg_closed = sal._close(govreg, "cl1", {1, 2, 3})
    left_side = sal.project(govreg_closed, "projA", ["a", "b"])
    company0_closed = sal._close(company0, "cl2", {1, 2, 3})
    company1_closed = sal._close(company1, "cl3", {1, 2, 3})
    companies = sal.concat([company0_closed, company1_closed], "right_rel")
    # NOTE(review): this projection is created but the join below consumes
    # "right_rel" directly, not "projB" - confirm that is intended.
    sal.project(companies, "projB", ["c", "d"])

    joined = sal.join(left_side, companies, "joined", ["a"], ["c"])
    totals = sal.aggregate(joined, "agg", ["b"], "d", "+", "total")

    sal._open(totals, "opened", 1)
    return {govreg, company0, company1}
Beispiel #3
0
def protocol():
    """Build the govreg/companies join-and-sum query.

    "govreg" (a, b) is created with {1}; "company0" and "company1" (each
    c, d) with {2} and {3}. The two company relations are concatenated,
    joined with govreg on a == c, column d is aggregated with "sum"
    grouped by b into "total", and the result is passed to ``cc.collect``
    with 1 as its second argument.

    :return: set containing the three ``cc.create`` nodes
    """
    regulator_schema = [defCol(col, "INTEGER", [1]) for col in ("a", "b")]
    regulator = cc.create("govreg", regulator_schema, {1})

    # in both company schemas the key column "c" gets an extra [1] trust-set
    # argument in addition to the company's own party id
    first_schema = [defCol("c", "INTEGER", [1], [2]), defCol("d", "INTEGER", [2])]
    first_company = cc.create("company0", first_schema, {2})

    second_schema = [defCol("c", "INTEGER", [1], [3]), defCol("d", "INTEGER", [3])]
    second_company = cc.create("company1", second_schema, {3})

    all_companies = cc.concat([first_company, second_company], "companies")
    matched = cc.join(regulator, all_companies, "joined", ["a"], ["c"])
    totals = cc.aggregate(matched, "actual", ["b"], "d", "sum", "total")
    cc.collect(totals, 1)

    return {regulator, first_company, second_company}
Beispiel #4
0
    def protocol():
        """Build a two-input join query and return the DAG roots.

        "in1" (a, b) is created with {1} and "in2" (c, d) with {2}. Each is
        projected onto both of its columns, the projections are joined on
        a == c, and the join result is passed to ``sal.collect`` with 1 as
        its second argument.

        :return: set containing the two ``sal.create`` nodes
        """
        # first input and its projection
        schema_one = [defCol(col, "INTEGER", [1]) for col in ("a", "b")]
        source_one = sal.create("in1", schema_one, {1})
        projected_one = sal.project(source_one, "proj1", ["a", "b"])

        # second input; key column "c" is declared with both [1] and [2]
        schema_two = [defCol("c", "INTEGER", [1], [2]), defCol("d", "INTEGER", [2])]
        source_two = sal.create("in2", schema_two, {2})
        projected_two = sal.project(source_two, "proj2", ["c", "d"])

        joined = sal.join(projected_one, projected_two, "res", ["a"], ["c"])

        # hand the result to party 1
        sal.collect(joined, 1)

        return {source_one, source_two}
Beispiel #5
0
def protocol(all_pids: list):
    """Build the medication/diagnosis distinct-count query over two parties.

    Four inputs are created: medication (8 columns named "0".."7") and
    diagnosis (13 columns named "8".."20") for party 1 and for party 2.
    In each input the first column is re-declared with ``all_pids`` as its
    trust-set argument. The per-party relations are concatenated, reduced
    to their PID / code / date columns, joined on PID, filtered three
    times ("cases", "aspirin", "heart_patients"), and the distinct count
    of the PID column is passed to ``cc.collect`` with 1 as its second
    argument.

    :param all_pids: trust-set argument used for the PID column of every input
    :return: set containing the four ``cc.create`` nodes
    """
    # column names are stringified positions
    med_pid, med_code, med_date = "0", "4", "7"
    diag_pid, diag_code, diag_date = "8", "16", "18"
    med_width, diag_width = 8, 13

    def _input_relation(rel_name, owner, pid_col, first_idx, width):
        # one INTEGER column per position, then swap in the public PID column
        schema = [
            defCol(str(first_idx + k), "INTEGER", owner) for k in range(width)
        ]
        schema[0] = defCol(pid_col, "INTEGER", all_pids)
        return cc.create(rel_name, schema, {owner})

    meds_p1 = _input_relation("left_medication", 1, med_pid, 0, med_width)
    diags_p1 = _input_relation("left_diagnosis", 1, diag_pid, med_width,
                               diag_width)
    meds_p2 = _input_relation("right_medication", 2, med_pid, 0, med_width)
    diags_p2 = _input_relation("right_diagnosis", 2, diag_pid, med_width,
                               diag_width)

    all_meds = cc.concat([meds_p1, meds_p2], "medication")
    all_diags = cc.concat([diags_p1, diags_p2], "diagnosis")

    # drop everything except PID, code and date
    meds_slim = cc.project(all_meds, "medication_proj",
                           [med_pid, med_code, med_date])
    diags_slim = cc.project(all_diags, "diagnosis_proj",
                            [diag_pid, diag_code, diag_date])

    matched = cc.join(meds_slim, diags_slim, "joined", [med_pid], [diag_pid])

    # diagnosis date < medication date
    by_date = cc.cc_filter(matched, "cases", diag_date, "<",
                           other_col_name=med_date)
    by_med = cc.cc_filter(by_date, "aspirin", med_code, "==", scalar=1)
    by_diag = cc.cc_filter(by_med, "heart_patients", diag_code, "==",
                           scalar=1)

    patient_count = cc.distinct_count(by_diag, "actual", med_pid)
    cc.collect(patient_count, 1)

    return {meds_p1, diags_p1, meds_p2, diags_p2}
Beispiel #6
0
def protocol_mpc(all_pids: list):
    """Build the manually sliced variant of the medication/diagnosis query.

    The same four inputs as the un-sliced query are created (medication
    columns "0".."7", diagnosis columns "8".."20", for parties 1 and 2,
    with the first column re-declared using ``all_pids``). For each party
    the PID columns of its two inputs are unioned and passed through
    ``cc._pub_intersect`` (the right-hand call uses ``is_server=False``);
    the intersect outputs are persisted. Each input is then projected to
    its PID / code / date columns and restricted with ``cc.filter_by``
    against its party's intersect output. The restricted relations are
    concatenated, joined on PID, filtered three times, and the distinct
    PID count ("actual_mpc") is passed to ``cc.collect`` with 1.

    :param all_pids: trust-set argument used for the PID column of every input
    :return: set containing the four ``cc.create`` nodes
    """
    med_pid, med_code, med_date = "0", "4", "7"
    diag_pid, diag_code, diag_date = "8", "16", "18"
    med_width, diag_width = 8, 13

    def _input_relation(rel_name, owner, pid_col, first_idx, width):
        # one INTEGER column per position, then swap in the public PID column
        schema = [
            defCol(str(first_idx + k), "INTEGER", owner) for k in range(width)
        ]
        schema[0] = defCol(pid_col, "INTEGER", all_pids)
        return cc.create(rel_name, schema, {owner})

    meds_p1 = _input_relation("left_medication", 1, med_pid, 0, med_width)
    diags_p1 = _input_relation("left_diagnosis", 1, diag_pid, med_width,
                               diag_width)
    meds_p2 = _input_relation("right_medication", 2, med_pid, 0, med_width)
    diags_p2 = _input_relation("right_diagnosis", 2, diag_pid, med_width,
                               diag_width)

    # --- manual slicing: PIDs known to each party, then the public intersect
    pids_p1 = cc.union(meds_p1, diags_p1, "left_pids", med_pid, diag_pid)
    pids_p2 = cc.union(meds_p2, diags_p2, "right_pids", med_pid, diag_pid)

    shared_p1 = cc._pub_intersect(pids_p1, "a_left_shared_pids", med_pid)
    cc._persist(shared_p1, "a_left_shared_pids")
    shared_p2 = cc._pub_intersect(pids_p2, "a_right_shared_pids", med_pid,
                                  is_server=False)
    cc._persist(shared_p2, "a_right_shared_pids")

    med_cols = [med_pid, med_code, med_date]
    meds_p1_slim = cc.project(meds_p1, "left_medication_proj", med_cols)
    meds_p1_shared = cc.filter_by(meds_p1_slim, "left_medication_shared",
                                  med_pid, shared_p1)

    diag_cols = [diag_pid, diag_code, diag_date]
    diags_p1_slim = cc.project(diags_p1, "left_diagnosis_proj", diag_cols)
    diags_p1_shared = cc.filter_by(diags_p1_slim, "left_diagnosis_shared",
                                   diag_pid, shared_p1)

    med_cols = [med_pid, med_code, med_date]
    meds_p2_slim = cc.project(meds_p2, "right_medication_proj", med_cols)
    meds_p2_shared = cc.filter_by(meds_p2_slim, "right_medication_shared",
                                  med_pid, shared_p2)

    diag_cols = [diag_pid, diag_code, diag_date]
    diags_p2_slim = cc.project(diags_p2, "right_diagnosis_proj", diag_cols)
    diags_p2_shared = cc.filter_by(diags_p2_slim, "right_diagnosis_shared",
                                   diag_pid, shared_p2)
    # --- slicing done

    meds_shared = cc.concat([meds_p1_shared, meds_p2_shared],
                            "medication_shared")
    diags_shared = cc.concat([diags_p1_shared, diags_p2_shared],
                             "diagnosis_shared")

    matched = cc.join(meds_shared, diags_shared, "joined", [med_pid],
                      [diag_pid])
    by_date = cc.cc_filter(matched, "cases", diag_date, "<",
                           other_col_name=med_date)
    by_med = cc.cc_filter(by_date, "aspirin", med_code, "==", scalar=1)
    by_diag = cc.cc_filter(by_med, "heart_patients", diag_code, "==",
                           scalar=1)

    patient_count = cc.distinct_count(by_diag, "actual_mpc", med_pid)
    cc.collect(patient_count, 1)

    return {meds_p1, diags_p1, meds_p2, diags_p2}
Beispiel #7
0
    def protocol():
        """Build a hand-annotated index-join DAG over two inputs.

        Every node's ``isMPC`` flag (and, where set, its output relation's
        ``storedWith``) is assigned explicitly here. The steps are: create
        and project "in1" and "in2"; close both projections to {1, 2, 3}
        and persist them; project the key column of each closed relation
        and open it with 1 as the last argument; index both opened key
        relations; join the indexed keys on a == c; keep only the two
        index columns; close them to {1, 2, 3}; and run ``sal._index_join``
        on the persisted relations using the closed indices. The joined
        result is opened with 1.

        :return: set containing the two ``sal.create`` nodes
        """
        # ---- first input (flagged non-MPC, stored with {1})
        schema_a = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        source_a = sal.create("in1", schema_a, {1})
        source_a.isMPC = False

        a_proj = sal.project(source_a, "proja", ["a", "b"])
        a_proj.isMPC = False
        a_proj.out_rel.storedWith = {1}

        # ---- second input (flagged non-MPC, stored with {2})
        schema_b = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        source_b = sal.create("in2", schema_b, {2})
        source_b.isMPC = False

        b_proj = sal.project(source_b, "projb", ["c", "d"])
        b_proj.isMPC = False
        b_proj.out_rel.storedWith = {2}

        # ---- close both sides and persist the closed relations
        a_closed = sal._close(a_proj, "clA", {1, 2, 3})
        a_closed.isMPC = True
        b_closed = sal._close(b_proj, "clB", {1, 2, 3})
        b_closed.isMPC = True

        a_persisted = sal._persist(a_closed, "persistedA")
        b_persisted = sal._persist(b_closed, "persistedB")

        # ---- key columns only, still closed
        a_keys_closed = sal.project(a_closed, "keysaclosed", ["a"])
        a_keys_closed.out_rel.storedWith = {1, 2, 3}
        a_keys_closed.isMPC = True
        b_keys_closed = sal.project(b_closed, "keysbclosed", ["c"])
        b_keys_closed.isMPC = True
        b_keys_closed.out_rel.storedWith = {1, 2, 3}

        # ---- open the keys (last argument 1)
        a_keys = sal._open(a_keys_closed, "keysa", 1)
        a_keys.isMPC = True
        b_keys = sal._open(b_keys_closed, "keysb", 1)
        b_keys.isMPC = True

        # ---- index, join and trim the opened keys (flagged non-MPC, {1})
        a_indexed = sal.index(a_keys, "indexedA", "indexA")
        a_indexed.isMPC = False
        a_indexed.out_rel.storedWith = {1}
        b_indexed = sal.index(b_keys, "indexedB", "indexB")
        b_indexed.isMPC = False
        b_indexed.out_rel.storedWith = {1}

        index_pairs = sal.join(a_indexed, b_indexed, "joinedindeces",
                               ["a"], ["c"])
        index_pairs.isMPC = False
        index_pairs.out_rel.storedWith = {1}

        index_cols = sal.project(index_pairs, "indecesonly",
                                 ["indexA", "indexB"])
        index_cols.isMPC = False
        index_cols.out_rel.storedWith = {1}

        # ---- close the index pairs and use them for the final join
        index_closed = sal._close(index_cols, "indecesclosed", {1, 2, 3})
        index_closed.isMPC = True

        result = sal._index_join(a_persisted, b_persisted, "joined",
                                 ["a"], ["c"], index_closed)
        result.isMPC = True

        sal._open(result, "opened", 1)

        # roots of the DAG
        return {source_a, source_b}
Beispiel #8
0
    def hybrid_join():
        """Build a hand-annotated hybrid-join DAG over three inputs.

        Every node's ``isMPC`` flag (and, where set, its output relation's
        ``storedWith``) is assigned explicitly. "govreg" (a, b) forms the
        left side; "company0" and "company1" (c, d) are closed and
        concatenated into the right side. Both sides are shuffled and
        persisted, their key columns are projected and opened with 1 as
        the last argument, the opened keys are indexed and joined on
        a == c, the two index columns are closed to {1, 2, 3}, and
        ``sal._index_join`` combines the persisted sides using those
        closed indices.

        :return: tuple of (index-join node, set of the three create nodes)
        """
        # ---- inputs and their projections, all flagged non-MPC
        schema_a = [
            defCol("a", "INTEGER", [1]),
            defCol("b", "INTEGER", [1]),
        ]
        source_a = sal.create("govreg", schema_a, {1})
        source_a.isMPC = False

        a_proj = sal.project(source_a, "proja", ["a", "b"])
        a_proj.isMPC = False
        a_proj.out_rel.storedWith = {1}

        schema_b = [
            defCol("c", "INTEGER", [1], [2]),
            defCol("d", "INTEGER", [2])
        ]
        source_b = sal.create("company0", schema_b, {2})
        source_b.isMPC = False

        b_proj = sal.project(source_b, "projb", ["c", "d"])
        b_proj.isMPC = False
        b_proj.out_rel.storedWith = {2}

        schema_c = [
            defCol("c", "INTEGER", [1], [3]),
            defCol("d", "INTEGER", [3])
        ]
        source_c = sal.create("company1", schema_c, {3})
        source_c.isMPC = False

        c_proj = sal.project(source_c, "projc", ["c", "d"])
        c_proj.isMPC = False
        c_proj.out_rel.storedWith = {3}

        # ---- close every projection to all three parties
        a_closed = sal._close(a_proj, "clA", {1, 2, 3})
        a_closed.isMPC = True
        b_closed = sal._close(b_proj, "clB", {1, 2, 3})
        b_closed.isMPC = True
        c_closed = sal._close(c_proj, "clC", {1, 2, 3})
        c_closed.isMPC = True

        # right side of the join is the concatenation of both companies
        right_closed = sal.concat([b_closed, c_closed], "clD")
        right_closed.isMPC = True
        right_closed.out_rel.storedWith = {1, 2, 3}

        # ---- shuffle and persist both sides
        left_shuffled = sal.shuffle(a_closed, "shuffledA")
        left_shuffled.isMPC = True
        left_persisted = sal._persist(left_shuffled, "persistedA")
        left_persisted.isMPC = True
        right_shuffled = sal.shuffle(right_closed, "shuffledB")
        right_shuffled.isMPC = True
        right_persisted = sal._persist(right_shuffled, "persistedB")
        right_persisted.isMPC = True

        # ---- key columns of the shuffled relations, still closed
        left_keys_closed = sal.project(left_shuffled, "keysaclosed", ["a"])
        left_keys_closed.out_rel.storedWith = {1, 2, 3}
        left_keys_closed.isMPC = True
        right_keys_closed = sal.project(right_shuffled, "keysbclosed", ["c"])
        right_keys_closed.isMPC = True
        right_keys_closed.out_rel.storedWith = {1, 2, 3}

        # ---- open the keys (last argument 1)
        left_keys = sal._open(left_keys_closed, "keysa", 1)
        left_keys.isMPC = True
        right_keys = sal._open(right_keys_closed, "keysb", 1)
        right_keys.isMPC = True

        # ---- index, join and trim the opened keys (flagged non-MPC, {1})
        left_indexed = sal.index(left_keys, "indexedA", "indexA")
        left_indexed.isMPC = False
        left_indexed.out_rel.storedWith = {1}
        right_indexed = sal.index(right_keys, "indexedB", "indexB")
        right_indexed.isMPC = False
        right_indexed.out_rel.storedWith = {1}

        index_pairs = sal.join(left_indexed, right_indexed, "joinedindeces",
                               ["a"], ["c"])
        index_pairs.isMPC = False
        index_pairs.out_rel.storedWith = {1}

        index_cols = sal.project(index_pairs, "indecesonly",
                                 ["indexA", "indexB"])
        index_cols.isMPC = False
        index_cols.out_rel.storedWith = {1}

        # ---- close the index pairs and run the index join
        index_closed = sal._close(index_cols, "indecesclosed", {1, 2, 3})
        index_closed.isMPC = True

        result = sal._index_join(left_persisted, right_persisted, "joined",
                                 ["a"], ["c"], index_closed)
        result.isMPC = True

        return result, {source_a, source_b, source_c}
Beispiel #9
0
def protocol():
    """Build the two-section retail price/quantity workflow.

    Inputs are "concatenated_DFs" (store/week sales rows) and
    "temp_UPC_brandBU_crspnd" (UPC-to-brand rows). Section 1 computes
    per store/brand/week values; section 2 repeats a similar computation
    per store_zip3/retailer/brand/week. The final projection is passed to
    ``sal.collect`` with 1 as its second argument.

    :return: set containing the two ``sal.create`` nodes
    """
    # Key column groups used several times below. They are kept as tuples
    # and copied with list() at every use so each call receives its own
    # fresh list object.
    store_keys = ('store_code_uc', 'brand_code_bu', 'week_end')
    geo_keys = ('store_zip3', 'retailer_code', 'brand_code_bu', 'week_end')

    sales_spec = (
        ("store_code_uc", "STRING"),
        ('upc', 'STRING'),
        ('week_end', 'STRING'),
        ('q', 'INTEGER'),
        ('avg_unit_p', 'FLOAT'),
        ('retailer_code', 'STRING'),
        ('store_zip3', 'STRING'),
    )
    sales_schema = [defCol(col, typ, [1]) for col, typ in sales_spec]

    brand_spec = (
        ('brand_code_bu', 'STRING'),
        ('brand_descr_bu', 'STRING'),
        ('upc', 'STRING'),
        ('size1_amount', 'FLOAT'),
    )
    brand_schema = [defCol(col, typ, [2]) for col, typ in brand_spec]

    # concatenated DFs from local_workflow.py
    sales = sal.create('concatenated_DFs', sales_schema, {1})

    # the output of preprocess_products.py
    # NOTE(review): the columns above are declared with [2] while the
    # relation is created with {1} - confirm which is intended.
    brands = sal.create('temp_UPC_brandBU_crspnd', brand_schema, {1})

    # SECTION 1
    # Compute the quantity weighted average price per unit
    # & total quantity sold at the store-brand level
    with_upc = sal.join(sales, brands, 'w_upc', ['upc'], ['upc'])
    with_oz_price = sal.divide(with_upc, 'w_avg_OZ_p', 'avg_OZ_p',
                               ['avg_unit_p', 'size1_amount'])
    with_oz_qty = sal.multiply(with_oz_price, 'w_q_upd', 'q',
                               ['q', 'size1_amount'])
    brand_qty = sal.aggregate(with_oz_qty, 'brand_OZq_sum', list(store_keys),
                              'q', '+', 'brand_OZq')
    with_brand_qty = sal.join(with_oz_qty, brand_qty, 'total_brnd_OZq',
                              list(store_keys), list(store_keys))
    weighted = sal.multiply(with_brand_qty, 'w_wghtd_OZ_brnd_p',
                            'wghtd_OZ_brnd_p', ['q', 'avg_OZ_p'])
    weighted_norm = sal.divide(weighted, 'w_wghtd_OZ_brnd_p_final',
                               'wghtd_OZ_brnd_p',
                               ['wghtd_OZ_brnd_p', 'brand_OZq'])
    brand_price = sal.aggregate(weighted_norm, 'brnd_p_sum', list(store_keys),
                                'wghtd_OZ_brnd_p', '+', 'avg_OZ_brnd_p')
    section_one_joined = sal.join(brand_price, weighted_norm, 'result',
                                  list(store_keys), list(store_keys))
    section_one = sal.project(section_one_joined, 'section_one_result', [
        "avg_OZ_brnd_p", "week_end", "store_code_uc", "brand_code_bu",
        "brand_descr_bu", "brand_OZq", 'retailer_code', 'store_zip3', 'q'
    ])

    # SECTION 2
    # Compute the average price per OZ & total OZs sold for each brand at the
    # retailer-$geo_unit level, by compiling the store level data that
    # comprises each retailer-$geo_unit. Compute the total quantity sold by
    # each retailer-$geo_unit
    geo_qty = sal.aggregate(section_one, 'temp_sum', list(geo_keys),
                            'brand_OZq', '+', 'brand_OZq')
    with_geo_qty = sal.join(section_one, geo_qty, 'result_brnd_sum',
                            list(geo_keys), list(geo_keys))
    geo_weighted = sal.multiply(with_geo_qty, 'wghtd_p_mult', 'wghtd_p',
                                ['brand_OZq', 'avg_OZ_brnd_p'])
    geo_weighted_norm = sal.divide(geo_weighted, 'wghtd_p_final', 'wghtd_p',
                                   ['wghtd_p', 'q'])
    geo_price = sal.aggregate(geo_weighted_norm, 'wghtd_p_sum', list(geo_keys),
                              'wghtd_p', '+', 'p')
    section_two = sal.join(geo_weighted_norm, geo_price, 'sec_4_result',
                           list(geo_keys), list(geo_keys))

    # TODO: filter out sec_4_result rows where 'store_zip3' cell is empty

    output = sal.project(section_two, 'final', [
        'store_zip3', 'retailer_code', 'week_end', 'brand_code_bu',
        'brand_descr_bu', 'q', 'p'
    ])

    sal.collect(output, 1)

    return {sales, brands}
Beispiel #10
0
        def hybrid_join():
            """Build a hand-annotated hybrid-join DAG over three inputs.

            Every node's ``is_mpc`` flag (and, where set, its output
            relation's ``stored_with``) is assigned explicitly. "govreg"
            (a, b) forms the left side; "company0" and "company1" (c, d)
            are closed and concatenated into the right side. Both sides
            are shuffled and persisted, their key columns are projected
            and opened with 1 as the last argument, the opened keys are
            indexed and joined on a == c, the two index columns are closed
            to {1, 2, 3}, and ``sal._index_join`` combines the persisted
            sides using those closed indices.

            :return: tuple of (index-join node, set of the three create nodes)
            """
            # ---- inputs and their projections, all flagged non-MPC
            schema_a = [
                defCol("a", "INTEGER", [1]),
                defCol("b", "INTEGER", [1]),
            ]
            source_a = sal.create("govreg", schema_a, {1})
            source_a.is_mpc = False

            a_proj = sal.project(source_a, "proj_a", ["a", "b"])
            a_proj.is_mpc = False
            a_proj.out_rel.stored_with = {1}

            schema_b = [
                defCol("c", "INTEGER", [1], [2]),
                defCol("d", "INTEGER", [2])
            ]
            source_b = sal.create("company0", schema_b, {2})
            source_b.is_mpc = False

            b_proj = sal.project(source_b, "proj_b", ["c", "d"])
            b_proj.is_mpc = False
            b_proj.out_rel.stored_with = {2}

            schema_c = [
                defCol("c", "INTEGER", [1], [3]),
                defCol("d", "INTEGER", [3])
            ]
            source_c = sal.create("company1", schema_c, {3})
            source_c.is_mpc = False

            c_proj = sal.project(source_c, "proj_c", ["c", "d"])
            c_proj.is_mpc = False
            c_proj.out_rel.stored_with = {3}

            # ---- close every projection to all three parties
            a_closed = sal._close(a_proj, "cl_a", {1, 2, 3})
            a_closed.is_mpc = True
            b_closed = sal._close(b_proj, "cl_b", {1, 2, 3})
            b_closed.is_mpc = True
            c_closed = sal._close(c_proj, "cl_c", {1, 2, 3})
            c_closed.is_mpc = True

            # right side of the join is the concatenation of both companies
            rhs_closed = sal.concat([b_closed, c_closed], "clD")
            rhs_closed.is_mpc = True
            rhs_closed.out_rel.stored_with = {1, 2, 3}

            # ---- shuffle and persist both sides
            lhs_shuffled = sal.shuffle(a_closed, "shuffled_a")
            lhs_shuffled.is_mpc = True
            lhs_persisted = sal._persist(lhs_shuffled, "persisted_a")
            lhs_persisted.is_mpc = True
            rhs_shuffled = sal.shuffle(rhs_closed, "shuffled_b")
            rhs_shuffled.is_mpc = True
            rhs_persisted = sal._persist(rhs_shuffled, "persisted_b")
            rhs_persisted.is_mpc = True

            # ---- key columns of the shuffled relations, still closed
            lhs_keys_closed = sal.project(lhs_shuffled, "keys_a_closed", ["a"])
            lhs_keys_closed.out_rel.stored_with = {1, 2, 3}
            lhs_keys_closed.is_mpc = True
            rhs_keys_closed = sal.project(rhs_shuffled, "keys_b_closed", ["c"])
            rhs_keys_closed.is_mpc = True
            rhs_keys_closed.out_rel.stored_with = {1, 2, 3}

            # ---- open the keys (last argument 1)
            lhs_keys = sal._open(lhs_keys_closed, "keys_a", 1)
            lhs_keys.is_mpc = True
            rhs_keys = sal._open(rhs_keys_closed, "keys_b", 1)
            rhs_keys.is_mpc = True

            # ---- index, join and trim the opened keys (non-MPC, {1})
            lhs_indexed = sal.index(lhs_keys, "indexed_a", "index_a")
            lhs_indexed.is_mpc = False
            lhs_indexed.out_rel.stored_with = {1}
            rhs_indexed = sal.index(rhs_keys, "indexed_b", "index_b")
            rhs_indexed.is_mpc = False
            rhs_indexed.out_rel.stored_with = {1}

            index_pairs = sal.join(lhs_indexed, rhs_indexed, "joined_indeces",
                                   ["a"], ["c"])
            index_pairs.is_mpc = False
            index_pairs.out_rel.stored_with = {1}

            index_cols = sal.project(index_pairs, "indeces_only",
                                     ["index_a", "index_b"])
            index_cols.is_mpc = False
            index_cols.out_rel.stored_with = {1}

            # ---- close the index pairs and run the index join
            index_closed = sal._close(index_cols, "indeces_closed", {1, 2, 3})
            index_closed.is_mpc = True

            result = sal._index_join(lhs_persisted, rhs_persisted, "joined",
                                     ["a"], ["c"], index_closed)
            result.is_mpc = True

            return result, {source_a, source_b, source_c}
Beispiel #11
0
def protocol_local(suffix: str, pid: int):
    """Build the single-party part of the medication/diagnosis query.

    Creates "<suffix>_medication" (columns "0".."7"), "<suffix>_diagnosis"
    (columns "8".."20") and "a_<suffix>_shared_pids" (one PID column), all
    declared for ``pid``. Both data relations are projected to their
    PID / code / date columns and passed through ``cc.filter_by`` against
    the shared-PID relation with ``use_not_in=True`` (presumably keeping
    only PIDs absent from it - confirm against ``filter_by``). The results
    are joined on PID, filtered three times, and a distinct count of the
    PID column named "actual_<suffix>" is added; it is not collected here.

    :param suffix: prefix used in the generated relation names
    :param pid: party id used for every column and relation
    :return: set containing the medication and diagnosis create nodes
        (the shared-PID create node is not part of the returned set)
    """
    med_pid, med_code, med_date = "0", "4", "7"
    diag_pid, diag_code, diag_date = "8", "16", "18"
    med_width, diag_width = 8, 13

    med_schema = [defCol(str(k), "INTEGER", pid) for k in range(med_width)]
    meds = cc.create(suffix + "_medication", med_schema, {pid})

    diag_schema = [
        defCol(str(k + med_width), "INTEGER", pid) for k in range(diag_width)
    ]
    diags = cc.create(suffix + "_diagnosis", diag_schema, {pid})

    shared_name = "a_{}_shared_pids".format(suffix)
    shared_schema = [defCol(med_pid, "INTEGER", pid)]
    shared = cc.create(shared_name, shared_schema, {pid})

    # only keep relevant columns, then filter against the shared PIDs
    meds_slim = cc.project(meds, "medication_proj",
                           [med_pid, med_code, med_date])
    meds_local = cc.filter_by(meds_slim, "medication_mine", med_pid, shared,
                              use_not_in=True)

    diags_slim = cc.project(diags, "diagnosis_proj",
                            [diag_pid, diag_code, diag_date])
    diags_local = cc.filter_by(diags_slim, "diagnosis_mine", diag_pid, shared,
                               use_not_in=True)

    matched = cc.join(meds_local, diags_local, "joined", [med_pid],
                      [diag_pid])

    # diagnosis date < medication date
    by_date = cc.cc_filter(matched, "cases", diag_date, "<",
                           other_col_name=med_date)
    by_med = cc.cc_filter(by_date, "aspirin", med_code, "==", scalar=1)
    by_diag = cc.cc_filter(by_med, "heart_patients", diag_code, "==",
                           scalar=1)

    cc.distinct_count(by_diag, "actual_" + suffix, med_pid)

    return {meds, diags}