Esempio n. 1
0
def seedwise_generation(X, seeds, protected_attribs, constraint, model, l_num, max_iter=10, s_g=1.0, s_l=1.0, epsilon=1e-6):
    # perform global generation and local generation successively on each single seed

    num_seeds = len(seeds)
    num_gen = np.array([0] * num_seeds)
    num_ids = np.array([0] * num_seeds)
    num_attribs = len(X[0])
    ids = np.empty(shape=(0, num_attribs))
    all_gen = np.empty(shape=(0, num_attribs))
    direction_l = [-1, 1]
    for index, instance in enumerate(seeds):
        x1 = instance.copy()
        flag = False
        for _ in range(max_iter):
            similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x1, similar_x1, model):
                ids = np.append(ids, [x1], axis=0)
                flag = True
                break
            x2 = generation_utilities.max_diff(x1, similar_x1, model)
            grad1 = compute_grad(x1, model)
            grad2 = compute_grad(x2, model)
            direction_g = np.zeros_like(X[0])
            sign_grad1 = np.sign(grad1)
            sign_grad2 = np.sign(grad2)
            for attrib in range(num_attribs):
                if attrib not in protected_attribs and sign_grad1[attrib] == sign_grad2[attrib]:
                    direction_g[attrib] = sign_grad1[attrib]
            x1 = x1 + s_g * direction_g
            x1 = generation_utilities.clip(x1, constraint)
            all_gen = np.append(all_gen, [x1], axis=0)
        if flag == True:
            x0 = x1.copy()
            for _ in range(l_num):
                similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
                x2 = generation_utilities.find_pair(x1, similar_x1, model)
                grad1 = compute_grad(x1, model)
                grad2 = compute_grad(x2, model)
                p = generation_utilities.normalization(grad1, grad2, protected_attribs, epsilon)
                a = generation_utilities.random_pick(p)
                s = generation_utilities.random_pick([0.5, 0.5])
                x1[a] = x1[a] + direction_l[s] * s_l
                x1 = generation_utilities.clip(x1, constraint)
                all_gen = np.append(all_gen, [x1], axis=0)
                similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
                if generation_utilities.is_discriminatory(x1, similar_x1, model):
                    ids = np.append(ids, [x1], axis=0)
                else:
                    x1 = x0.copy()
        nondup_ids = np.array(list(set([tuple(id) for id in ids])))
        nondup_gen = np.array(list(set([tuple(gen) for gen in all_gen])))
        num_gen[index] = len(nondup_gen)
        num_ids[index] = len(nondup_ids)
    return num_gen, num_ids
Esempio n. 2
0
def local_generation(num_attribs, l_num, g_id, protected_attribs, constraint, model, s_l, epsilon):
    # local generation phase of ADF

    direction = [-1, 1]
    l_id = np.empty(shape=(0, num_attribs))
    all_gen_l = np.empty(shape=(0, num_attribs))
    try_times = 0
    for x1 in g_id:
        x0 = x1.copy()
        for _ in range(l_num):
            try_times += 1
            similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
            x2 = generation_utilities.find_pair(x1, similar_x1, model)
            grad1 = compute_grad(x1, model)
            grad2 = compute_grad(x2, model)
            p = generation_utilities.normalization(grad1, grad2, protected_attribs, epsilon)
            a = generation_utilities.random_pick(p)
            s = generation_utilities.random_pick([0.5, 0.5])
            x1[a] = x1[a] + direction[s] * s_l
            x1 = generation_utilities.clip(x1, constraint)
            all_gen_l = np.append(all_gen_l, [x1], axis=0)
            similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x1, similar_x1, model):
                l_id = np.append(l_id, [x1], axis=0)
            else:
                x1 = x0.copy()
    l_id = np.array(list(set([tuple(id) for id in l_id])))
    return l_id, all_gen_l, try_times
Esempio n. 3
0
def global_generation(X, seeds, num_attribs, g_num, protected_attribs, constraint, model, max_iter, s_g):
    # global generation phase of ADF

    g_id = np.empty(shape=(0, num_attribs))
    all_gen_g = np.empty(shape=(0, num_attribs))
    try_times = 0
    for i in range(g_num):
        x1 = seeds[i].copy()
        for _ in range(max_iter):
            try_times += 1
            similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x1, similar_x1, model):
                g_id = np.append(g_id, [x1], axis=0)
                break
            x2 = generation_utilities.max_diff(x1, similar_x1, model)
            grad1 = compute_grad(x1, model)
            grad2 = compute_grad(x2, model)
            direction = np.zeros_like(X[0])
            sign_grad1 = np.sign(grad1)
            sign_grad2 = np.sign(grad2)
            for attrib in range(num_attribs):
                if attrib not in protected_attribs and sign_grad1[attrib] == sign_grad2[attrib]:
                    direction[attrib] = sign_grad1[attrib]
            x1 = x1 + s_g * direction
            x1 = generation_utilities.clip(x1, constraint)
            all_gen_g = np.append(all_gen_g, [x1], axis=0)
    g_id = np.array(list(set([tuple(id) for id in g_id])))
    return g_id, all_gen_g, try_times
Esempio n. 4
0
def time_record(X, seeds, protected_attribs, constraint, model, g_num, l_num, record_step, record_frequency, max_iter=10, s_g=1.0, s_l=1.0, epsilon=1e-6):
    # record time consumption
    
    t1 = time.time()
    num_attribs = len(X[0])
    t = np.array([0.0] * record_frequency)
    direction_l = [-1, 1]
    threshold = record_step
    index = 0
    ids = np.empty(shape=(0, num_attribs))
    num_ids = num_ids_before = 0
    for instance in seeds:
        if num_ids >= record_frequency * record_step:
            break
        x1 = instance.copy()
        flag = False
        for i in range(max_iter+1):
            similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x1, similar_x1, model):
                ids = np.append(ids, [x1], axis=0)
                flag = True
                break
            if i == max_iter:
                break
            x2 = generation_utilities.max_diff(x1, similar_x1, model)
            grad1 = compute_grad(x1, model)
            grad2 = compute_grad(x2, model)
            direction_g = np.zeros_like(X[0])
            sign_grad1 = np.sign(grad1)
            sign_grad2 = np.sign(grad2)
            for attrib in range(num_attribs):
                if attrib not in protected_attribs and sign_grad1[attrib] == sign_grad2[attrib]:
                    direction_g[attrib] = sign_grad1[attrib]
            x1 = x1 + s_g * direction_g
            x1 = generation_utilities.clip(x1, constraint)
            t2 = time.time()
        if flag == True:
            ids = np.array(list(set([tuple(id) for id in ids])))
            num_ids = len(ids)
            if num_ids > num_ids_before:
                num_ids_before = num_ids
                if num_ids == threshold:
                    t[index] = t2 - t1
                    threshold += record_step
                    index += 1
                    if num_ids >= record_frequency * record_step:
                        break
            x0 = x1.copy()
            for _ in range(l_num):
                similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
                x2 = generation_utilities.find_pair(x1, similar_x1, model)
                grad1 = compute_grad(x1, model)
                grad2 = compute_grad(x2, model)
                p = generation_utilities.normalization(grad1, grad2, protected_attribs, epsilon)
                a = generation_utilities.random_pick(p)
                s = generation_utilities.random_pick([0.5, 0.5])
                x1[a] = x1[a] + direction_l[s] * s_l
                x1 = generation_utilities.clip(x1, constraint)
                t2 = time.time()
                similar_x1 = generation_utilities.similar_set(x1, num_attribs, protected_attribs, constraint)
                if generation_utilities.is_discriminatory(x1, similar_x1, model):
                    ids = np.append(ids, [x1], axis=0)
                    ids = np.array(list(set([tuple(id) for id in ids])))
                    num_ids = len(ids)
                    if num_ids > num_ids_before:
                        num_ids_before = num_ids
                        if num_ids == threshold:
                            t[index] = t2 - t1
                            threshold += record_step
                            index += 1
                            if num_ids >= record_frequency * record_step:
                                break
                else:
                    x1 = x0.copy()
    return t
Esempio n. 5
0
def local_comparison(num_experiment_round,
                     benchmark,
                     X,
                     protected_attribs,
                     constraint,
                     model,
                     update_interval_list,
                     num_seeds=100,
                     l_num=1000,
                     c_num=4,
                     s_l=1.0,
                     epsilon=1e-6):
    # compare the local phase given the same individual discriminatory instances set

    num_ids = np.array([0] * (len(update_interval_list) + 1))
    time_cost = np.array([0] * (len(update_interval_list) + 1))

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        num_attribs = len(X[0])
        clustered_data = generation_utilities.clustering(X, c_num)
        id_seeds = np.empty(shape=(0, num_attribs))
        for i in range(100000000):
            x_seed = generation_utilities.get_seed(clustered_data,
                                                   len(X),
                                                   c_num,
                                                   i % c_num,
                                                   fashion='RoundRobin')
            similar_x_seed = generation_utilities.similar_set(
                x_seed, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x_seed, similar_x_seed,
                                                      model):
                id_seeds = np.append(id_seeds, [x_seed], axis=0)
                if len(id_seeds) >= num_seeds:
                    break

        t1 = time.time()
        ids_ADF, _, total_iter_ADF = ADF.local_generation(
            num_attribs, l_num, id_seeds.copy(), protected_attribs, constraint,
            model, s_l, epsilon)
        t2 = time.time()
        num_ids_ADF = len(ids_ADF)
        print(
            'ADF:', 'In', total_iter_ADF, 'search iterations,', num_ids_ADF,
            'non-duplicate individual discriminatory instances are generated. Time cost:',
            t2 - t1, 's.')
        num_ids[0] += num_ids_ADF
        time_cost[0] += t2 - t1

        for index, update_interval in enumerate(update_interval_list):
            print('Update interval set to {}:'.format(update_interval))
            t1 = time.time()
            ids_EIDIG, _, total_iter_EIDIG = EIDIG.local_generation(
                num_attribs, l_num, id_seeds.copy(), protected_attribs,
                constraint, model, update_interval, s_l, epsilon)
            t2 = time.time()
            num_ids_EIDIG = len(ids_EIDIG)
            print(
                'EIDIG:', 'In', total_iter_EIDIG, 'search iterations,',
                num_ids_EIDIG,
                'non-duplicate individual discriminatory instances are generated. Time cost:',
                t2 - t1, 's.')
            num_ids[index + 1] += num_ids_EIDIG
            time_cost[index + 1] += t2 - t1

        print('\n')

    avg_num_ids = num_ids / num_experiment_round
    avg_speed = num_ids / time_cost
    print(
        'Results of local phase comparsion on', benchmark,
        'with l_num set to {} given {} discriminatory seeds'.format(
            l_num, num_seeds), ',averaged on', num_experiment_round, 'rounds:')
    print('ADF:', avg_num_ids[0],
          'individual discriminatory instances are generated at a speed of',
          avg_speed[0], 'per second.')
    for index, update_interval in enumerate(update_interval_list):
        print('Update interval set to {}:'.format(update_interval))
        print(
            'EIDIG:', avg_num_ids[index + 1],
            'individual discriminatory instances are generated at a speed of',
            avg_speed[index + 1], 'per second.')

    return num_ids, time_cost
Esempio n. 6
0
def global_comparison(num_experiment_round,
                      benchmark,
                      X,
                      protected_attribs,
                      constraint,
                      model,
                      decay_list,
                      num_seeds=1000,
                      c_num=4,
                      max_iter=10,
                      s_g=1.0):
    # compare the global phase given the same set of seeds

    num_ids = np.array([0] * (len(decay_list) + 1))
    num_iter = np.array([0] * (len(decay_list) + 1))
    time_cost = np.array([0] * (len(decay_list) + 1))

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        num_attribs = len(X[0])
        num_dis = 0
        if num_seeds >= len(X):
            seeds = X
        else:
            clustered_data = generation_utilities.clustering(X, c_num)
            seeds = np.empty(shape=(0, num_attribs))
            for i in range(num_seeds):
                x_seed = generation_utilities.get_seed(clustered_data,
                                                       len(X),
                                                       c_num,
                                                       i % c_num,
                                                       fashion='Distribution')
                seeds = np.append(seeds, [x_seed], axis=0)
        for seed in seeds:
            similar_seed = generation_utilities.similar_set(
                seed, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(seed, similar_seed,
                                                      model):
                num_dis += 1
        print('Given', num_seeds,
              '(no more than 600 for german credit) seeds,', num_dis,
              'of which are individual discriminatory instances.')

        t1 = time.time()
        ids_ADF, _, total_iter_ADF = ADF.global_generation(
            X, seeds, num_attribs, num_seeds, protected_attribs, constraint,
            model, max_iter, s_g)
        t2 = time.time()
        num_ids_ADF = len(ids_ADF)
        print(
            'ADF:', 'In', total_iter_ADF, 'search iterations,', num_ids_ADF,
            'non-duplicate individual discriminatory instances are generated. Time cost:',
            t2 - t1, 's.')
        num_ids[0] += num_ids_ADF
        num_iter[0] += total_iter_ADF
        time_cost[0] += t2 - t1

        for index, decay in enumerate(decay_list):
            print('Decay factor set to {}:'.format(decay))
            t1 = time.time()
            ids_EIDIG, _, total_iter_EIDIG = EIDIG.global_generation(
                X, seeds, num_attribs, num_seeds, protected_attribs,
                constraint, model, decay, max_iter, s_g)
            t2 = time.time()
            num_ids_EIDIG = len(ids_EIDIG)
            print(
                'EIDIG:', 'In', total_iter_EIDIG, 'search iterations,',
                num_ids_EIDIG,
                'non-duplicate individual discriminatory instances are generated. Time cost:',
                t2 - t1, 's.')
            num_ids[index + 1] += num_ids_EIDIG
            num_iter[index + 1] += total_iter_EIDIG
            time_cost[index + 1] += t2 - t1

        print('\n')

    avg_num_ids = num_ids / num_experiment_round
    avg_speed = num_ids / time_cost
    avg_iter = num_iter / num_experiment_round / num_seeds
    print('Results of global phase comparsion on', benchmark,
          'given {} seeds'.format(num_seeds), ',averaged on',
          num_experiment_round, 'rounds:')
    print('ADF:', avg_num_ids[0],
          'individual discriminatory instances are generated at a speed of',
          avg_speed[0],
          'per second, and the number of iterations on a singe seed is',
          avg_iter[0], '.')
    for index, decay in enumerate(decay_list):
        print('Decay factor set to {}:'.format(decay))
        print(
            'EIDIG:', avg_num_ids[index + 1],
            'individual discriminatory instances are generated at a speed of',
            avg_speed[index + 1],
            'per second, and the number of iterations on a singe seed is',
            avg_iter[index + 1], '.')

    return num_ids, num_iter, time_cost