def method3(acc_matrix):
    """Post-process the LR and SVM predictions on the adult data with an LP.

    For each classifier column (``'LR'``, ``'SVM'``) a linear program is
    built over a vector of ``2 ** 8 + 2 ** 8 // 4`` unknowns:

    * the first ``2 ** 8`` entries are addressed by ``get_loc`` of a full
      event ``(A1, A2, N, M1, M2, S, YH, YT)`` and are later used as the
      probability passed to ``random_assign`` for turning prediction ``YH``
      into ``YT``;
    * the next two blocks of ``2 ** 8 // 8`` entries each are auxiliary
      variables addressed by ``get_loc`` of ``(A1, A2, N, S, YT)``; the
      constraint rows bound them from below ("max" block) and from above
      ("min" block) by the per-``(M1, M2)`` mapped prediction rates.

    The solved mapping is applied to both the training and the test frame,
    the accuracies are recorded, and both frames are written to
    ``temp/adult_binary_{train,test}_prediction3.csv``.

    The function relies on the module-level names ``tau``, ``spos``,
    ``sneg``, ``yt_pos`` and ``src_path``.

    NOTE(review): the array sizes assume every variable involved has exactly
    two domain values -- confirm against ``adult.xml``.
    NOTE(review): another ``method3`` (synthetic data) appears later in the
    same source; if both live in one module the later one rebinds the name.

    Args:
        acc_matrix: Object supporting ``.iloc[:, 3] = values`` (presumably a
            pandas DataFrame). Column 3 is overwritten with
            ``[LR train, LR test, SVM train, SVM test]`` accuracies.

    Raises:
        ValueError: If the LP solver returns no primal solution.
    """
    train = DataSet(pd.read_csv('temp/adult_binary_train_prediction0.csv'))
    df_test = pd.read_csv('temp/adult_binary_test_prediction0.csv')
    # The test frame is stacked three times before post-processing.
    df_test = pd.concat([df_test] * 3, ignore_index=True)
    test = DataSet(df_test)

    # Problem sizes (same values as the literal expressions they replace).
    n_map = 2 ** 8              # mapping variables
    n_aux = 2 ** 8 // 8         # size of each auxiliary (max / min) block
    n_var = n_map + 2 * n_aux   # == 2 ** 8 + 2 ** 8 // 4
    n_half = 2 ** 8 // 2        # rows in each half of the max/min constraints
    n_fair = 2 ** 4             # rows in each half of the fairness constraints
    n_eq = 2 ** 8 // 2          # equality ("sum = 1") rows

    acc = []

    for name in ['LR', 'SVM']:
        # A fresh network per classifier: YH / YT are registered on it below.
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path, '../data/adult/adult.xml'))

        def find_condition_prob(e, t):
            return probabilistic_cbn.find_prob(e, t)

        def get_loc(e):
            return probabilistic_cbn.get_loc(e)

        A1 = probabilistic_cbn.v['age']
        A2 = probabilistic_cbn.v['education']
        S = probabilistic_cbn.v['sex']
        M1 = probabilistic_cbn.v['workclass']
        M2 = probabilistic_cbn.v['marital-status']
        N = probabilistic_cbn.v['hours']
        Y = probabilistic_cbn.v['income']

        # YH is the classifier's prediction column, YT the post-processed one.
        YH = Variable(name=name, index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH
        YT = Variable(name=name + "M", index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        a1_vals = list(A1.domains.get_all())
        a2_vals = list(A2.domains.get_all())
        n_vals = list(N.domains.get_all())
        m1_vals = list(M1.domains.get_all())
        m2_vals = list(M2.domains.get_all())
        s_vals = list(S.domains.get_all())
        yh_vals = list(YH.domains.get_all())
        yt_vals = list(YT.domains.get_all())

        # build linear loss function
        C_vector = np.zeros((n_var, 1))
        for a1, a2, n, m1, m2, s in product(a1_vals, a2_vals, n_vals,
                                            m1_vals, m2_vals, s_vals):
            p_x_s = train.get_marginal_prob(
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

            # Cost of keeping the prediction (YT == YH).
            p_yh_1_y = p_x_s * train.count(
                Event({Y: 0, YH: 0}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                'notequal')
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s,
                                 YH: 0, YT: 0}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({YH: 0}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s,
                                 YH: 1, YT: 1}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({YH: 1}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

            # Cost of flipping the prediction (YT != YH).
            p_yh__y = p_x_s * train.count(
                Event({Y: 0, YH: 0}),
                Event({A1: a1, A2: a2, M1: m1, M2: m2, N: n, S: s}),
                'equal')
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s,
                                 YH: 0, YT: 1}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({YH: 0}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s,
                                 YH: 1, YT: 0}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({YH: 1}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

        # the inequality of max and min
        G_matrix_1 = np.zeros((2 * n_half, n_var))
        h_1 = np.zeros(2 * n_half)

        # max: sum_yh P(yh | x) * mapping[x, yh, yt] - aux_max[a, n, s, yt] <= 0
        i = 0
        for a1, a2, n, s, yt in product(a1_vals, a2_vals, n_vals,
                                        s_vals, yt_vals):
            for m1, m2 in product(m1_vals, m2_vals):
                for yh in yh_vals:
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                         M2: m2, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = train.get_conditional_prob(
                        Event({YH: yh}),
                        Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))
                G_matrix_1[i, n_map + loc] = -1
                i += 1

        # min: -sum_yh P(yh | x) * mapping[x, yh, yt] + aux_min[a, n, s, yt] <= 0
        assert i == n_half
        for a1, a2, n, s, yt in product(a1_vals, a2_vals, n_vals,
                                        s_vals, yt_vals):
            for m1, m2 in product(m1_vals, m2_vals):
                for yh in yh_vals:
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                         M2: m2, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = -train.get_conditional_prob(
                        Event({YH: yh}),
                        Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))
                G_matrix_1[i, n_map + n_aux + loc] = 1
                i += 1

        # build counterfactual fairness constraints (each row bounded by tau)
        G_matrix_2 = np.zeros((n_fair * 2, n_var))
        h_2 = np.ones(n_fair * 2) * tau

        # First half: aux_max terms for spos minus mapped terms for sneg.
        i = 0
        for a1, a2, m1, m2 in product(a1_vals, a2_vals, m1_vals, m2_vals):
            for n in n_vals:
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: spos,
                                     YT: yt_pos}))
                G_matrix_2[i, n_map + loc] = find_condition_prob(
                    Event({N: n}),
                    Event({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))
                for yh in yh_vals:
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                         M2: m2, S: sneg, YH: yh,
                                         YT: yt_pos}))
                    G_matrix_2[i, loc] = -find_condition_prob(
                        Event({N: n}),
                        Event({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
                        * train.get_conditional_prob(
                            Event({YH: yh}),
                            Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2,
                                   S: sneg}))
            i += 1

        # Second half: mapped terms for sneg minus aux_min terms for spos.
        assert i == n_fair
        for a1, a2, m1, m2 in product(a1_vals, a2_vals, m1_vals, m2_vals):
            for n in n_vals:
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: spos,
                                     YT: yt_pos}))
                G_matrix_2[i, n_map + n_aux + loc] = -find_condition_prob(
                    Event({N: n}),
                    Event({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))
                for yh in yh_vals:
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                         M2: m2, S: sneg, YH: yh,
                                         YT: yt_pos}))
                    G_matrix_2[i, loc] = find_condition_prob(
                        Event({N: n}),
                        Event({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
                        * train.get_conditional_prob(
                            Event({YH: yh}),
                            Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2,
                                   S: sneg}))
            i += 1

        # mapping in [0, 1]: x_j <= 1 (top half) and -x_j <= 0 (bottom half)
        G_matrix_3 = np.zeros((2 * n_var, n_var))
        h_3 = np.zeros(2 * n_var)
        diag = np.arange(n_var)
        G_matrix_3[diag, diag] = 1
        h_3[:n_var] = 1
        G_matrix_3[n_var + diag, diag] = -1

        # sum = 1: for every (x, yh) the mapping over yt sums to one
        A_matrix = np.zeros((n_eq, n_var))
        b = np.ones(n_eq)
        i = 0
        for a1, a2, n, m1, m2, s, yh in product(a1_vals, a2_vals, n_vals,
                                                m1_vals, m2_vals, s_vals,
                                                yh_vals):
            for yt in yt_vals:
                A_matrix[i, get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                           M2: m2, S: s, YH: yh,
                                           YT: yt}))] = 1
            i += 1
        assert i == n_eq

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        solvers.options['show_progress'] = False
        # The default solver is used. The original passed ``solver=solvers``
        # (the module itself), which names no backend and therefore already
        # fell through to the default.
        sol = solvers.lp(c=matrix(C_vector), G=matrix(G_matrix), h=matrix(h),
                         A=matrix(A_matrix), b=matrix(b))
        if sol['x'] is None:
            raise ValueError(
                'LP solver returned no solution for {!r} (status: {})'.format(
                    name, sol['status']))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df.loc[:, name + 'M'] = train.df[name]
        test.df[name + 'M'] = test.df[name]
        for a1, a2, n, m1, m2, s, yh, yt in product(a1_vals, a2_vals, n_vals,
                                                    m1_vals, m2_vals, s_vals,
                                                    yh_vals, yt_vals):
            if yh.name != yt.name:
                p = mapping[get_loc(Event({A1: a1, A2: a2, N: n, M1: m1,
                                           M2: m2, S: s, YH: yh,
                                           YT: yt})), 0]
                train.random_assign(
                    Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2,
                           S: s}),
                    Event({YT: yt}), p)
                test.random_assign(
                    Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2,
                           S: s}),
                    Event({YT: yt}), p)

        # Copy the result back and really remove the scratch column:
        # DataFrame.drop returns a new frame unless inplace=True, so the
        # original call had no effect.
        train.df[name] = train.df[name + 'M']
        train.df.drop([name + 'M'], axis=1, inplace=True)
        test.df[name] = test.df[name + 'M']
        test.df.drop([name + 'M'], axis=1, inplace=True)

        acc.append(accuracy_score(train.df[name], train.df[Y.name]))
        acc.append(accuracy_score(test.df[name], test.df[Y.name]))

    acc_matrix.iloc[:, 3] = acc
    train.df.to_csv('temp/adult_binary_train_prediction3.csv', index=False)
    test.df.to_csv('temp/adult_binary_test_prediction3.csv', index=False)
def method3(acc_matrix):
    """Post-process the LR and SVM predictions on the synthetic data with an LP.

    For each classifier column (``'LR'``, ``'SVM'``) a linear program is
    built over a vector of ``2 ** 6 + 2 ** 6 // 2`` unknowns:

    * the first ``2 ** 6`` entries are addressed by ``get_loc`` of a full
      event ``(A, M, N, S, YH, YT)`` and are later used as the probability
      passed to ``random_assign`` for turning prediction ``YH`` into ``YT``;
    * the next two blocks of ``2 ** 6 // 4`` entries each are auxiliary
      variables addressed by ``get_loc`` of ``(A, N, S, YT)``; the constraint
      rows bound them from below ("max" block) and from above ("min" block)
      by the per-``M`` mapped prediction rates.

    The solved mapping is applied to both the training and the test frame,
    the accuracies are recorded, and both frames are written to
    ``temp/synthetic_{train,test}_prediction3.csv``.

    The function relies on the module-level names ``tau``, ``spos``,
    ``sneg``, ``yt_pos`` and ``src_path``.

    NOTE(review): the array sizes assume every variable involved has exactly
    two domain values -- confirm against the model XML.
    NOTE(review): another ``method3`` (adult data) appears earlier in the
    same source; if both live in one module this definition rebinds the name.

    Args:
        acc_matrix: Object supporting ``.iloc[:, 3] = values`` (presumably a
            pandas DataFrame). Column 3 is overwritten with
            ``[LR train, LR test, SVM train, SVM test]`` accuracies.

    Raises:
        ValueError: If the LP solver returns no primal solution.
    """
    train = DataSet(pd.read_csv('temp/synthetic_train_prediction0.csv'))
    test = DataSet(pd.read_csv('temp/synthetic_test_prediction0.csv'))

    # Problem sizes (same values as the literal expressions they replace).
    n_map = 2**6                # mapping variables
    n_aux = 2**6 // 4           # size of each auxiliary (max / min) block
    n_var = n_map + 2 * n_aux   # == 2**6 + 2**6 // 2
    n_half = 2**6 // 2          # rows in each half of the max/min constraints
    n_fair = 2**2               # rows in each half of the fairness constraints
    n_eq = 2**6 // 2            # equality ("sum = 1") rows

    acc = []

    for name in ['LR', 'SVM']:
        # A fresh network per classifier: YH / YT are registered on it below.
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path,
                         '../data/synthetic/ProbabilisticBayesianModel.xml'))

        def find_condition_prob(e, t):
            return probabilistic_cbn.find_prob(e, t)

        def get_loc(e):
            return probabilistic_cbn.get_loc(e)

        A = probabilistic_cbn.v['A']
        S = probabilistic_cbn.v['S']
        N = probabilistic_cbn.v['N']
        M = probabilistic_cbn.v['M']
        Y = probabilistic_cbn.v['Y']

        # YH / YT only address LP variables here; the data frames are queried
        # through the string column names (`name`, `name + '1'`) instead.
        YH = Variable(name='YH', index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH
        YT = Variable(name='YT', index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        a_vals = list(A.domains.get_all())
        n_vals = list(N.domains.get_all())
        m_vals = list(M.domains.get_all())
        s_vals = list(S.domains.get_all())
        yh_vals = list(YH.domains.get_all())
        yt_vals = list(YT.domains.get_all())

        # build linear loss function
        C_vector = np.zeros((n_var, 1))
        for a, n, m, s in product(a_vals, n_vals, m_vals, s_vals):
            p_x_s = train.get_marginal_prob(
                Event({'A': a, 'M': m, 'N': n, 'S': s}))

            # Cost of keeping the prediction (YT == YH).
            p_yh_1_y = p_x_s * train.count(
                Event({'Y': 0, name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}),
                'notequal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 0}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 1}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 1}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}))

            # Cost of flipping the prediction (YT != YH).
            p_yh__y = p_x_s * train.count(
                Event({'Y': 0, name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}),
                'equal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 1}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 0}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 1}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}))

        # the inequality of max and min
        G_matrix_1 = np.zeros((2 * n_half, n_var))
        h_1 = np.zeros(2 * n_half)

        # max: sum_yh P(yh | x) * mapping[x, yh, yt] - aux_max[a, n, s, yt] <= 0
        i = 0
        for a, n, s, yt in product(a_vals, n_vals, s_vals, yt_vals):
            for m in m_vals:
                for yh in yh_vals:
                    loc = get_loc(
                        Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = train.get_conditional_prob(
                        Event({name: yh}),
                        Event({'A': a, 'M': m, 'N': n, 'S': s}))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, n_map + loc] = -1
                i += 1

        # min: -sum_yh P(yh | x) * mapping[x, yh, yt] + aux_min[a, n, s, yt] <= 0
        assert i == n_half
        for a, n, s, yt in product(a_vals, n_vals, s_vals, yt_vals):
            for m in m_vals:
                for yh in yh_vals:
                    loc = get_loc(
                        Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = -train.get_conditional_prob(
                        Event({name: yh}),
                        Event({'A': a, 'M': m, 'N': n, 'S': s}))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, n_map + n_aux + loc] = 1
                i += 1

        # build counterfactual fairness constraints (each row bounded by tau)
        G_matrix_2 = np.zeros((n_fair * 2, n_var))
        h_2 = np.ones(n_fair * 2) * tau

        # First half: aux_max terms for spos minus mapped terms for sneg.
        i = 0
        for a, m in product(a_vals, m_vals):
            for n in n_vals:
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, n_map + loc] = find_condition_prob(
                    Event({N: n}), Event({A: a, S: spos}))
                for yh in yh_vals:
                    loc = get_loc(
                        Event({A: a, M: m, N: n, S: sneg, YH: yh,
                               YT: yt_pos}))
                    G_matrix_2[i, loc] = -find_condition_prob(
                        Event({N: n}), Event({A: a, S: sneg})) \
                        * train.get_conditional_prob(
                            Event({name: yh}),
                            Event({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        # Second half: mapped terms for sneg minus aux_min terms for spos.
        assert i == n_fair
        for a, m in product(a_vals, m_vals):
            for n in n_vals:
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, n_map + n_aux + loc] = -find_condition_prob(
                    Event({N: n}), Event({A: a, S: spos}))
                for yh in yh_vals:
                    loc = get_loc(
                        Event({A: a, M: m, N: n, S: sneg, YH: yh,
                               YT: yt_pos}))
                    G_matrix_2[i, loc] = find_condition_prob(
                        Event({N: n}), Event({A: a, S: sneg})) \
                        * train.get_conditional_prob(
                            Event({name: yh}),
                            Event({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        # mapping in [0, 1]: x_j <= 1 (top half) and -x_j <= 0 (bottom half)
        G_matrix_3 = np.zeros((n_var * 2, n_var))
        h_3 = np.zeros(n_var * 2)
        diag = np.arange(n_var)
        G_matrix_3[diag, diag] = 1
        h_3[:n_var] = 1
        G_matrix_3[n_var + diag, diag] = -1

        # sum = 1: for every (x, yh) the mapping over yt sums to one
        A_matrix = np.zeros((n_eq, n_var))
        b = np.ones(n_eq)
        i = 0
        for a, n, m, s, yh in product(a_vals, n_vals, m_vals, s_vals,
                                      yh_vals):
            for yt in yt_vals:
                A_matrix[i, get_loc(
                    Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))] = 1
            i += 1
        assert i == n_eq

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        solvers.options['show_progress'] = False
        # The default solver is used. The original passed ``solver=solvers``
        # (the module itself), which names no backend and therefore already
        # fell through to the default.
        sol = solvers.lp(c=matrix(C_vector), G=matrix(G_matrix), h=matrix(h),
                         A=matrix(A_matrix), b=matrix(b))
        if sol['x'] is None:
            raise ValueError(
                'LP solver returned no solution for {!r} (status: {})'.format(
                    name, sol['status']))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df[name + '1'] = train.df[name]
        test.df[name + '1'] = test.df[name]
        for a, n, m, s, yh, yt in product(a_vals, n_vals, m_vals, s_vals,
                                          yh_vals, yt_vals):
            if yh.name != yt.name:
                p = mapping[get_loc(
                    Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt})), 0]
                train.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({name + '1': yt}), p)
                test.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({name + '1': yt}), p)

        # Copy the result back and really remove the scratch column:
        # DataFrame.drop returns a new frame unless inplace=True, so the
        # original call had no effect.
        train.df[name] = train.df[name + '1']
        train.df.drop([name + '1'], axis=1, inplace=True)
        test.df[name] = test.df[name + '1']
        test.df.drop([name + '1'], axis=1, inplace=True)

        acc.append(accuracy_score(train.df['Y'], train.df[name]))
        acc.append(accuracy_score(test.df['Y'], test.df[name]))

    acc_matrix.iloc[:, 3] = acc
    train.df.to_csv('temp/synthetic_train_prediction3.csv', index=False)
    test.df.to_csv('temp/synthetic_test_prediction3.csv', index=False)