Ejemplo n.º 1
0
def check_coint(stock, stock1, stock2):
    """Run the cointegration check for one formation window.

    Slices the formation window out of ``stock``, drops stationary series
    via the ADF filter, asks ``find_pairs`` for a cointegrated pair and,
    if one is found, returns a one-row-per-pair DataFrame describing it
    (stock names, normalized weights, model, p-value, and the spread's
    mean / std / skewness).  Returns 0 when fewer than two non-stationary
    series remain or no pair is found.

    Parameters
    ----------
    stock : pd.DataFrame
        Price matrix (rows = time, columns = stocks).
    stock1, stock2 :
        Candidate identifiers forwarded unchanged to ``find_pairs``.
    """
    formate_time = 150  # length of the formation (modelling) window
    trade_time = 100    # length of the back-test window
    for j in range(1):  # one formation per day
        day1_1 = stock.iloc[(trade_time * j):(formate_time +
                                              (trade_time * j)), :]
        day1_1.index = np.arange(0, len(day1_1), 1)
    unitroot_stock = ADF.adf.drop_stationary(ADF.adf(day1_1))
    if unitroot_stock.shape[1] < 2:
        # Need at least two non-stationary series to form a pair.
        return 0
    x = find_pairs(stock1, stock2, unitroot_stock)
    x = list(x)
    if len(x[0]) != 0:
        # Normalize the cointegration weights so |w1| + |w2| == 1.
        total = abs(x[0][0][0]) + abs(x[0][0][1])
        weight1_std = x[0][0][0] / total
        weight2_std = x[0][0][1] / total
        spread = (weight1_std * unitroot_stock.iloc[:, 0] +
                  weight2_std * unitroot_stock.iloc[:, 1])
        ave = []
        std = []
        ske = []
        for i in range(1):
            y = spread
            # Models with a time-trend term must be de-trended first.
            if x[2][0] == 'model4':
                # BUG FIX: the original assigned the trend axis to ``x``,
                # clobbering the find_pairs result and crashing on the
                # later ``x.extend`` call; use a separate name.
                t = np.arange(0, len(y))
                b1, b0 = np.polyfit(t, y, 1)
                trend_line = t * b1 + b0
                y = y - trend_line
                # After removing the trend, compute mu and std.
                ave.append(np.mean(y))
                std.append(np.std(y))
                ske.append(skew(y))
            else:
                ave.append(np.mean(y))
                std.append(np.std(y))
                ske.append(skew(y))
        x.extend([ave, std, ske])
        c = {
            "stock1": x[1][0][0],
            "stock2": x[1][0][1],
            "weight1": weight1_std,
            "weight2": weight2_std,
            "model": x[2],
            "p_val": x[3],
            "mean": x[4],
            "std": x[5],
            "skewness": x[6]
        }
        data = pd.DataFrame(c)  # convert the dict into a DataFrame
        return data

    else:
        return 0
Ejemplo n.º 2
0
    fin_cursor.execute(query)
    result = fin_cursor.fetchall()
    fin_db.commit()
    df = pd.DataFrame(list(result))
    df = df.pivot(index='mtimestamp', columns='code', values='avg_price')
    df = df.fillna(method='ffill')
    df = df.fillna(method='backfill')
    df.index = np.arange(0, len(df), 1)
    df = df.drop(index=np.arange(0, 16, 1))
    df.index = np.arange(0, len(df), 1)

    day1_1 = df.iloc[(trade_time * j):(formate_time + (trade_time * j)), :]
    day1_1.index = np.arange(0, len(day1_1), 1)

    print(day1_1)
    unitroot_stock = ADF.adf.drop_stationary(ADF.adf(day1_1))

    a = accelerate_formation.pairs_trading(unitroot_stock, flag)
    table = accelerate_formation.pairs_trading.formation_period(a)

    print(table)

    stock1_name = table.stock1.astype('str', copy=False)
    stock2_name = table.stock2.astype('str', copy=False)
    test_stock1 = np.array(df[stock1_name].T)
    test_stock2 = np.array(df[stock2_name].T)

    mean = np.zeros(len(table))
    std = np.zeros(len(table))
    for i in range(len(table)):
        spread_m, spread = spread_mean(test_stock1, test_stock2, i, table)
Ejemplo n.º 3
0
            print("save tick min data ......... done.")



            query = "SELECT * FROM Fintech.Stock_30s_Price_Tick where DateTime >= '"+ choose_date +" 09:00' and DateTime <= '"+ choose_date +" 13:30';"
            fin_cursor.execute(query)
            result = fin_cursor.fetchall()
            fin_db.commit()
            halfmin = pd.DataFrame(list(result))
            halfmin = halfmin.drop(columns=["index","DateTime"])
            halfmin = halfmin[corp].astype('float')
            halfmin.to_csv(half_path.format(year)+"{}_half_min.csv".format(choose_date.replace("-","")), index=False)
            print("save half min data ......... done.")


            unitroot_stock = ADF.adf.drop_stationary(ADF.adf(avg_min))
            
            try:
                a = accelerate_formation.pairs_trading(unitroot_stock,flag)
                table = accelerate_formation.pairs_trading.formation_period( a ) 
                if table.empty:
                    continue   
                stock1_name = table.stock1.astype('str',copy=False)
                stock2_name = table.stock2.astype('str',copy=False)
                test_stock1 = np.array(df[stock1_name].T)
                test_stock2 = np.array(df[stock2_name].T)
                
                mean = np.zeros(len(table))
                std = np.zeros(len(table))
                for i in range(len(table)):
                    spread_m,spread = spread_mean(test_stock1,test_stock2,i,table)
Ejemplo n.º 4
0
Archivo: try.py Proyecto: q40603/Demo
def trade_all_pairs(capital, maxi, open_time, stop_loss_time, tax_cost):
    """Back-test pairs trading over every date available under ``base``.

    For each trading day the routine: loads raw match (tick) data for the
    top-50 corps, builds per-minute volume-weighted average prices, runs
    the ADF unit-root filter plus the formation step to select pairs,
    then simulates each selected pair via ``pairs``.

    Parameters
    ----------
    capital : capital allocated to each pair.
    maxi : maximum number of lots held per stock.
    open_time : open-position threshold, in multiples of the spread std.
    stop_loss_time : stop-loss threshold, in multiples of the spread std.
    tax_cost : transaction tax / cost used by the simulation.
    """

    # Trading universe: first comma-separated field of each line.
    # NOTE(review): the file handle is never closed — consider ``with``.
    f_50 = open('50_corp.txt', encoding='utf-8')
    top_50 = [i.split(",")[0] for i in f_50]
    header_name = ["mtimestamp", "code", "price", "vol", "acu_vol"]
    print(top_50)
    # Scan the data directory for every YYYYMMDD date string.
    dirs = os.listdir(base)
    dirs = ''.join(dirs)
    all_date = re.findall(r"\d{8,8}", dirs)
    for date in all_date:
        print(date)
        if (int(date) < 20181027):
            # Skip dates before the sample period.
            continue

        # Raw match file: timestamp, code, price, volume, accumulated vol.
        # NOTE(review): ``error_bad_lines`` is deprecated in newer pandas
        # (replaced by ``on_bad_lines``) — confirm the pinned version.
        data = pd.read_csv("".join([base, date, "_Match.txt"]),
                           header=None,
                           usecols=[0, 1, 3, 4, 5],
                           names=header_name,
                           sep=',',
                           dtype={
                               'mtimestamp': 'object',
                               'code': 'object'
                           },
                           encoding="utf-8",
                           engine="python",
                           error_bad_lines=False)
        data = data[data['code'].isin(top_50)]

        # Parse HHMM (first 4 chars) plus the date into full timestamps.
        data["mtimestamp"] = data["mtimestamp"].apply(
            lambda x: datetime.strptime(" ".join([date, x[0:4]]), '%Y%m%d %H%M'
                                        ))
        #print(data)
        tick_data = data.copy()

        # Per-minute volume-weighted average price, one column per code.
        data = data.groupby([
            "mtimestamp", "code"
        ]).apply(lambda x: (x['price'] * x['vol']).sum() / x["vol"].sum())
        data = data.unstack().resample('T').ffill().bfill()
        min_data = data.copy()

        day1 = data.reset_index()

        # print(day1)
        day1.index = np.arange(0, len(day1), 1)
        # print(day1)
        # Formation window: first 149 minutes.  NOTE(review): the
        # ``formate_time`` constant below is 150 — confirm the intended
        # window length (possible off-by-one).
        day1_1 = day1.iloc[0:149, :]
        day1_1 = day1_1.drop(columns=['mtimestamp'])
        print(day1_1)
        # print(df)
        day1_1.index = np.arange(0, len(day1_1), 1)

        # Drop stationary series, then run formation to select pairs.
        unitroot_stock = ADF.adf.drop_stationary(
            ADF.adf(day1_1.select_dtypes(exclude=['object'])))
        a = accelerate_formation.pairs_trading(unitroot_stock)
        table = accelerate_formation.pairs_trading.formation_period(a)
        if table.empty:
            continue
            # print(table)
        print(table)
        # tracking_list = pd.concat([table.stock1, table.stock2])
        # tracking_list = pd.unique(tracking_list)
        # print(tracking_list)

        #========================================== back test ==============================================

        #query = "select left(stime, 16) as mtimestamp, code , price from s_price_tick where stime >= '"+ choose_date +" 11:29' and stime <= '"+ choose_date +" 13:25' GROUP BY code, mtimestamp;"
        # fin_cursor.execute(query)
        # result = fin_cursor.fetchall()
        # fin_db.commit()
        # df = pd.DataFrame(list(result))
        # df = df.pivot(index='mtimestamp', columns='code', values='price')
        # df = df.fillna(method='ffill')
        # tick_data = df.fillna(method='backfill')
        # tick_data.index = np.arange(0,len(tick_data),1)
        # NOTE(review): last_time / start_time are currently unused — the
        # timestamp filter below is commented out in favor of iloc slicing.
        last_time = datetime.strptime(date + ' 1325', '%Y%m%d %H%M')
        start_time = datetime.strptime(date + ' 1129', '%Y%m%d %H%M')

        tick_data = tick_data.drop(["vol", "acu_vol"], axis=1)

        # Keep the last tick per (code, minute), then pivot to wide form.
        tick_data = tick_data.groupby(["code", "mtimestamp"]).tail(1)
        #print(tick_data)
        tick_data = tick_data.pivot(index='mtimestamp',
                                    columns='code',
                                    values='price')
        tick_data = tick_data.ffill().bfill()
        print(tick_data)
        # Back-test window: rows 150-264 (minutes after formation).
        tick_data = tick_data.iloc[150:265]
        #tick_data = tick_data[(tick_data["mtimestamp"]>=start_time & tick_data["mtimestamp"]<=last_time)]
        print(tick_data)
        tick_data.index = np.arange(0, len(tick_data), 1)

        # query = "select left(stime, 16) as mtimestamp, code , sum(volume * price)/sum(volume) as avg_price from s_price_tick where stime > '"+ choose_date +" 11:30' and stime <= '"+ choose_date +" 13:25' GROUP BY code, mtimestamp;"
        # fin_cursor.execute(query)
        # result = fin_cursor.fetchall()
        # fin_db.commit()
        # df = pd.DataFrame(list(result))
        # df = df.pivot(index='mtimestamp', columns='code', values='avg_price')
        # df = df.fillna(method='ffill')
        # start_time = datetime.strptime(date+' 1130','%Y%m%d %H%M')
        # min_data = min_data[(min_data["mtimestamp"]>start_time & min_data["mtimestamp"]<=last_time)]
        # min_data.index = np.arange(0,len(min_data),1)
        # print(min_data)
        min_data = day1.iloc[150:265]
        min_data.index = np.arange(0, len(min_data), 1)
        formate_time = 150  # formation-window length in minutes

        # capital = 3000           # capital per pair (3 million)
        # maxi = 5                 # maximum lots held per stock
        # open_time = 1.5                 # open-position threshold multiple
        # stop_loss_time = 10                  # stop-loss threshold multiple
        # tax_cost = 0
        # Simulate every pair selected by the formation step.
        l_table = len(table.index)
        for i in range(l_table):
            print(i)
            y = table.iloc[i, :]
            print(y)
            tmp = pairs(i, formate_time, y, min_data, tick_data, open_time,
                        stop_loss_time, day1, maxi, tax_cost, capital)

            print(tmp)
Ejemplo n.º 5
0
Archivo: try.py Proyecto: q40603/Demo
def trade_certain_pairs(choose_date, capital, maxi, open_time, stop_loss_time,
                        tax_cost, pair_list):
    """Back-test only the pairs listed in ``pair_list`` for one date.

    Loads minute-VWAP data from the DB for ``choose_date``, reuses pairs
    cached in the ``pairs`` table (running the formation step and caching
    the result when absent), then simulates each formed pair that also
    appears in ``pair_list`` via ``pairs``.

    Parameters
    ----------
    choose_date : date string spliced into the SQL predicates.
    capital / maxi / open_time / stop_loss_time / tax_cost :
        trading parameters, same meaning as in ``trade_all_pairs``.
    pair_list : iterable of (stock1, stock2) pairs to trade.
    """

    # NOTE(review): SQL is assembled by string concatenation — safe only
    # if ``choose_date`` is trusted; prefer parameterized queries.
    query = "select left(stime, 16) as mtimestamp, code , sum(volume * price)/sum(volume) as avg_price from s_price_tick where stime >= '" + choose_date + "' and stime <= '" + choose_date + " 13:25' GROUP BY code, mtimestamp;"
    print(query)
    fin_cursor.execute(query)
    result = fin_cursor.fetchall()
    fin_db.commit()
    # Wide minute-VWAP matrix, forward- then back-filled.
    df = pd.DataFrame(list(result))
    df = df.pivot(index='mtimestamp', columns='code', values='avg_price')
    df = df.fillna(method='ffill')
    day1 = df.fillna(method='backfill')
    day1 = day1.reset_index()
    # print(day1)
    day1.index = np.arange(0, len(day1), 1)
    # print(day1)
    # Formation window: first 149 rows.
    day1_1 = day1.iloc[0:149, :]
    # print(df)
    day1_1.index = np.arange(0, len(day1_1), 1)
    # print(len(day1_1.index))

    # print(df)

    # Reuse cached formation results when this date was already processed.
    query = "select distinct f_date from pairs where f_date = '" + choose_date + "';"
    fin_cursor.execute(query)
    result = list(fin_cursor.fetchall())
    fin_db.commit()
    if (not len(result)):
        unitroot_stock = ADF.adf.drop_stationary(
            ADF.adf(day1_1.select_dtypes(exclude=['object'])))
        a = accelerate_formation.pairs_trading(unitroot_stock)
        table = accelerate_formation.pairs_trading.formation_period(a)
        # Cache every formed pair for later reuse.
        for i, j in table.iterrows():
            sql = "INSERT INTO pairs (stock1, stock2, w1, w2, snr, zcr, mu, stdev, f_date ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);"
            try:
                fin_cursor.execute(
                    sql, (str(j["stock1"]), str(j["stock2"]), str(
                        j["w1"]), str(j["w2"]), str(j["snr"]), str(j["zcr"]),
                          str(j["mu"]), str(j["stdev"]), str(choose_date)))
            except Exception as e:
                # Best-effort insert: log the failing row and keep going.
                print(e, j)
        fin_db.commit()
        # print(table)
    else:
        print(datetime.now().strftime("%Y-%b-%d"))
        query = "select * from pairs where f_date = '" + choose_date + "';"
        fin_cursor.execute(query)
        result = fin_cursor.fetchall()
        fin_db.commit()
        table = pd.DataFrame(list(result))
        table.index = np.arange(0, len(table), 1)
        # print(table)
    print(table)
    # tracking_list = pd.concat([table.stock1, table.stock2])
    # tracking_list = pd.unique(tracking_list)
    # print(tracking_list)

    #========================================== back test ==============================================

    # Tick prices for the back-test window (11:29 - 13:25).
    query = "select left(stime, 16) as mtimestamp, code , price from s_price_tick where stime >= '" + choose_date + " 11:29' and stime <= '" + choose_date + " 13:25' GROUP BY code, mtimestamp;"
    fin_cursor.execute(query)
    result = fin_cursor.fetchall()
    fin_db.commit()
    df = pd.DataFrame(list(result))
    df = df.pivot(index='mtimestamp', columns='code', values='price')
    df = df.fillna(method='ffill')
    tick_data = df.fillna(method='backfill')
    tick_data.index = np.arange(0, len(tick_data), 1)
    print(tick_data)

    # Minute VWAP for the back-test window (11:30 - 13:25).
    query = "select left(stime, 16) as mtimestamp, code , sum(volume * price)/sum(volume) as avg_price from s_price_tick where stime > '" + choose_date + " 11:30' and stime <= '" + choose_date + " 13:25' GROUP BY code, mtimestamp;"
    fin_cursor.execute(query)
    result = fin_cursor.fetchall()
    fin_db.commit()
    df = pd.DataFrame(list(result))
    df = df.pivot(index='mtimestamp', columns='code', values='avg_price')
    df = df.fillna(method='ffill')
    min_data = df.fillna(method='backfill')
    min_data.index = np.arange(0, len(min_data), 1)
    print(min_data)

    formate_time = 150  # formation-window length in minutes

    # capital = 3000           # capital per pair (3 million)
    # maxi = 5                 # maximum lots held per stock
    # open_time = 1.5                 # open-position threshold multiple
    # stop_loss_time = 10                  # stop-loss threshold multiple
    # tax_cost = 0
    # Trade only pairs requested in ``pair_list`` (matched in either order).
    l_table = len(table.index)
    for i in range(l_table):
        print(i)
        y = table.iloc[i, :]
        for j in pair_list:
            if (j[0] == y.stock1
                    and j[1] == y.stock2) or (j[0] == y.stock2
                                              and j[1] == y.stock1):
                print(y)
                tmp = pairs(i, formate_time, y, min_data, tick_data, open_time,
                            stop_loss_time, day1, maxi, tax_cost, capital)
                print(tmp)
                break
Ejemplo n.º 6
0
def time_cost_comparison(num_experiment_round,
                         benchmark,
                         X,
                         protected_attribs,
                         constraint,
                         model,
                         record_step=100,
                         record_frequency=100,
                         g_num=1000,
                         l_num=1000,
                         decay=0.5,
                         c_num=4,
                         max_iter=10,
                         s_g=1.0,
                         s_l=1.0,
                         epsilon_l=1e-6,
                         fashion='RoundRobin'):
    """Compare ADF / EIDIG-5 / EIDIG-INF time consumption.

    Measures the time each approach needs to generate a fixed number of
    non-duplicate individual discriminatory instances (one record every
    ``record_step`` ids, ``record_frequency`` records), averages the
    curves over ``num_experiment_round`` rounds, saves them as ``.npy``
    files and prints the first ten records of each.
    """

    # Row 0: ADF, row 1: EIDIG-5, row 2: EIDIG-INF.
    time_cost = np.zeros([3, record_frequency])

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        if len(X) <= g_num:
            seeds = X.copy()
        else:
            clustered_data = generation_utilities.clustering(X, c_num)
            seeds = np.empty(shape=(0, len(X[0])))
            # BUG FIX: the seed loop reused ``i`` and shadowed the outer
            # round counter; use a dedicated index.
            for seed_idx in range(g_num):
                new_seed = generation_utilities.get_seed(clustered_data,
                                                         len(X),
                                                         c_num,
                                                         seed_idx % c_num,
                                                         fashion=fashion)
                seeds = np.append(seeds, [new_seed], axis=0)

        t_ADF = ADF.time_record(X, seeds, protected_attribs, constraint, model,
                                g_num, l_num, record_step, record_frequency,
                                max_iter, s_g, s_l, epsilon_l)
        t_EIDIG_5 = EIDIG.time_record(X, seeds, protected_attribs, constraint,
                                      model, decay, g_num, l_num, record_step,
                                      record_frequency, 5, max_iter, s_g, s_l,
                                      epsilon_l)
        # ``l_num + 1`` effectively disables the update interval (EIDIG-INF).
        t_EIDIG_INF = EIDIG.time_record(X, seeds, protected_attribs,
                                        constraint, model, decay, g_num, l_num,
                                        record_step, record_frequency,
                                        l_num + 1, max_iter, s_g, s_l,
                                        epsilon_l)
        time_cost[0] += t_ADF
        time_cost[1] += t_EIDIG_5
        time_cost[2] += t_EIDIG_INF

    avg_time_cost = time_cost / num_experiment_round
    np.save(
        'logging_data/logging_data_from_tests/time_cost_comparison/' +
        benchmark + '_time_every{}ids_ADF.npy'.format(record_step),
        avg_time_cost[0])
    np.save(
        'logging_data/logging_data_from_tests/time_cost_comparison/' +
        benchmark + '_time_every{}ids_EIDIG_5.npy'.format(record_step),
        avg_time_cost[1])
    np.save(
        'logging_data/logging_data_from_tests/time_cost_comparison/' +
        benchmark + '_time_every{}ids_EIDIG_INF.npy'.format(record_step),
        avg_time_cost[2])

    print('Results averaged on', num_experiment_round,
          'rounds have been saved. Results on the first 10 records:')
    print('ADF:')
    print('Time cost:', avg_time_cost[0, :10])
    print('EIDIG-5:')
    print('Time cost:', avg_time_cost[1, :10])
    print('EIDIG-INF:')
    print('Time cost:', avg_time_cost[2, :10])
Ejemplo n.º 7
0
def seedwise_comparison(num_experiment_round,
                        benchmark,
                        X,
                        protected_attribs,
                        constraint,
                        model,
                        g_num=100,
                        l_num=100,
                        c_num=4,
                        max_iter=10,
                        s_g=1.0,
                        s_l=1.0,
                        epsilon_l=1e-6,
                        fashion='RoundRobin'):
    """Compare ADF / EIDIG-5 / EIDIG-INF in a seedwise fashion.

    For each experiment round, draws ``g_num`` seeds and records, per
    seed, the number of generated instances and of non-duplicate
    individual discriminatory instances for each approach.  Averaged
    per-seed results are saved as ``.npy`` files and a preview of the
    first ten seeds is printed.
    """

    # Row 0: ADF, row 1: EIDIG-5, row 2: EIDIG-INF; one column per seed.
    num_gen = np.zeros([3, g_num])
    num_ids = np.zeros([3, g_num])

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        clustered_data = generation_utilities.clustering(X, c_num)
        seeds = np.empty(shape=(0, len(X[0])))
        # BUG FIX: the seed loop reused ``i`` and shadowed the outer
        # round counter; use a dedicated index.
        for seed_idx in range(g_num):
            new_seed = generation_utilities.get_seed(clustered_data,
                                                     len(X),
                                                     c_num,
                                                     seed_idx % c_num,
                                                     fashion=fashion)
            seeds = np.append(seeds, [new_seed], axis=0)

        gen_ADF, ids_ADF = ADF.seedwise_generation(X, seeds, protected_attribs,
                                                   constraint, model, l_num,
                                                   max_iter, s_g, s_l,
                                                   epsilon_l)
        gen_EIDIG_5, ids_EIDIG_5 = EIDIG.seedwise_generation(
            X, seeds, protected_attribs, constraint, model, l_num, 0.5, 5,
            max_iter, s_g, s_l, epsilon_l)
        gen_EIDIG_INF, ids_EIDIG_INF = EIDIG.seedwise_generation(
            X, seeds, protected_attribs, constraint, model, l_num, 0.5,
            l_num + 1, max_iter, s_g, s_l, epsilon_l)
        num_gen[0] += gen_ADF
        num_ids[0] += ids_ADF
        num_gen[1] += gen_EIDIG_5
        num_ids[1] += ids_EIDIG_5
        num_gen[2] += gen_EIDIG_INF
        num_ids[2] += ids_EIDIG_INF

    # BUG FIX: the averages were computed but the raw per-round sums were
    # saved and printed, contradicting the "Results averaged ... have been
    # saved" message (and the sibling time_cost_comparison, which saves
    # averages); save and report the averaged arrays instead.
    avg_num_gen = num_gen / num_experiment_round
    avg_num_ids = num_ids / num_experiment_round
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_gen_ADF.npy', avg_num_gen[0])
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_ids_ADF.npy', avg_num_ids[0])
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_gen_EIDIG_5.npy', avg_num_gen[1])
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_ids_EIDIG_5.npy', avg_num_ids[1])
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_gen_EIDIG_INF.npy', avg_num_gen[2])
    np.save(
        'logging_data/logging_data_from_tests/seedwise_comparison/' +
        benchmark + '_num_ids_EIDIG_INF.npy', avg_num_ids[2])

    print('Results averaged on', num_experiment_round,
          'rounds have been saved. Results on the first 10 seeds:')
    print('ADF:')
    print('Number of generated instances:', avg_num_gen[0, :10])
    print('Number of generated discriminatory instances:', avg_num_ids[0, :10])
    print('EIDIG-5:')
    print('Number of generated instances:', avg_num_gen[1, :10])
    print('Number of generated discriminatory instances:', avg_num_ids[1, :10])
    print('EIDIG-INF:')
    print('Number of generated instances:', avg_num_gen[2, :10])
    print('Number of generated discriminatory instances:', avg_num_ids[2, :10])
Ejemplo n.º 8
0
def comparison(num_experiment_round,
               benchmark,
               X,
               protected_attribs,
               constraint,
               model,
               g_num=1000,
               l_num=1000,
               decay=0.5,
               c_num=4,
               max_iter=10,
               s_g=1.0,
               s_l=1.0,
               epsilon_l=1e-6,
               fashion='RoundRobin'):
    """Compare EIDIG with ADF in terms of effectiveness and efficiency.

    Runs the complete generation pipeline of ADF, EIDIG-5 and EIDIG-INF
    for ``num_experiment_round`` rounds, saves each round's discriminatory
    instances as ``.npy`` files, and prints the average number of
    instances and generation speed per approach.
    """

    # Index 0: ADF, 1: EIDIG-5, 2: EIDIG-INF.
    num_ids = np.zeros(3, dtype=int)
    # BUG FIX: time_cost was an integer array, so ``+= t2 - t1`` truncated
    # each round's elapsed seconds to whole seconds and corrupted
    # avg_speed; use a float array.
    time_cost = np.zeros(3)

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        if g_num >= len(X):
            seeds = X.copy()
        else:
            clustered_data = generation_utilities.clustering(X, c_num)
            seeds = np.empty(shape=(0, len(X[0])))
            # BUG FIX: the seed loop reused ``i`` and shadowed the outer
            # round counter; use a dedicated index.
            for seed_idx in range(g_num):
                new_seed = generation_utilities.get_seed(clustered_data,
                                                         len(X),
                                                         c_num,
                                                         seed_idx % c_num,
                                                         fashion=fashion)
                seeds = np.append(seeds, [new_seed], axis=0)

        t1 = time.time()
        ids_ADF, gen_ADF, total_iter_ADF = ADF.individual_discrimination_generation(
            X, seeds, protected_attribs, constraint, model, g_num, l_num,
            max_iter, s_g, s_l, epsilon_l)
        np.save(
            'logging_data/logging_data_from_tests/complete_comparison/' +
            benchmark + '_ids_ADF_' + str(round_now) + '.npy', ids_ADF)
        t2 = time.time()
        print('ADF:', 'In', total_iter_ADF, 'search iterations', len(gen_ADF),
              'non-duplicate instances are explored', len(ids_ADF),
              'of which are discriminatory. Time cost:', t2 - t1, 's.')
        num_ids[0] += len(ids_ADF)
        time_cost[0] += t2 - t1

        t1 = time.time()
        ids_EIDIG_5, gen_EIDIG_5, total_iter_EIDIG_5 = EIDIG.individual_discrimination_generation(
            X, seeds, protected_attribs, constraint, model, decay, g_num,
            l_num, 5, max_iter, s_g, s_l, epsilon_l)
        np.save(
            'logging_data/logging_data_from_tests/complete_comparison/' +
            benchmark + '_ids_EIDIG_5_' + str(round_now) + '.npy', ids_EIDIG_5)
        t2 = time.time()
        print('EIDIG-5:', 'In', total_iter_EIDIG_5, 'search iterations',
              len(gen_EIDIG_5), 'non-duplicate instances are explored',
              len(ids_EIDIG_5), 'of which are discriminatory. Time cost:',
              t2 - t1, 's.')
        num_ids[1] += len(ids_EIDIG_5)
        time_cost[1] += t2 - t1

        t1 = time.time()
        # ``l_num + 1`` effectively disables the update interval (EIDIG-INF).
        ids_EIDIG_INF, gen_EIDIG_INF, total_iter_EIDIG_INF = EIDIG.individual_discrimination_generation(
            X, seeds, protected_attribs, constraint, model, decay, g_num,
            l_num, l_num + 1, max_iter, s_g, s_l, epsilon_l)
        np.save(
            'logging_data/logging_data_from_tests/complete_comparison/' +
            benchmark + '_ids_EIDIG_INF_' + str(round_now) + '.npy',
            ids_EIDIG_INF)
        t2 = time.time()
        print('EIDIG-INF:', 'In', total_iter_EIDIG_INF, 'search iterations',
              len(gen_EIDIG_INF), 'non-duplicate instances are explored',
              len(ids_EIDIG_INF), 'of which are discriminatory. Time cost:',
              t2 - t1, 's.')
        num_ids[2] += len(ids_EIDIG_INF)
        time_cost[2] += t2 - t1

        print('\n')

    avg_num_ids = num_ids / num_experiment_round
    avg_speed = num_ids / time_cost
    print('Results of complete comparison on', benchmark,
          'with g_num set to {} and l_num set to {}'.format(g_num, l_num),
          ',averaged on', num_experiment_round, 'rounds:')
    for index, approach in [(0, 'ADF'), (1, 'EIDIG-5'), (2, 'EIDIG-INF')]:
        print(
            approach, ':', avg_num_ids[index],
            'individual discriminatory instances are generated at a speed of',
            avg_speed[index], 'per second.')
Ejemplo n.º 9
0
def local_comparison(num_experiment_round,
                     benchmark,
                     X,
                     protected_attribs,
                     constraint,
                     model,
                     update_interval_list,
                     num_seeds=100,
                     l_num=1000,
                     c_num=4,
                     s_l=1.0,
                     epsilon=1e-6):
    """Compare the local phase of ADF and EIDIG on identical seed sets.

    Collects ``num_seeds`` discriminatory seeds per round, then runs the
    local generation phase of ADF and of EIDIG for each update interval
    in ``update_interval_list``, reporting instance counts and speeds
    averaged over ``num_experiment_round`` rounds.

    Returns
    -------
    (num_ids, time_cost) : per-approach totals over all rounds; index 0
        is ADF, index k+1 is EIDIG with ``update_interval_list[k]``.
    """

    num_ids = np.zeros(len(update_interval_list) + 1, dtype=int)
    # BUG FIX: time_cost was an integer array, so ``+= t2 - t1`` truncated
    # fractional seconds and corrupted avg_speed; use a float array.
    time_cost = np.zeros(len(update_interval_list) + 1)

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        num_attribs = len(X[0])
        clustered_data = generation_utilities.clustering(X, c_num)
        id_seeds = np.empty(shape=(0, num_attribs))
        # Keep drawing candidate seeds until ``num_seeds`` discriminatory
        # ones are collected (the huge range is just a safety bound).
        # BUG FIX: the candidate loop reused ``i`` and shadowed the outer
        # round counter; use a dedicated index.
        for attempt in range(100000000):
            x_seed = generation_utilities.get_seed(clustered_data,
                                                   len(X),
                                                   c_num,
                                                   attempt % c_num,
                                                   fashion='RoundRobin')
            similar_x_seed = generation_utilities.similar_set(
                x_seed, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(x_seed, similar_x_seed,
                                                      model):
                id_seeds = np.append(id_seeds, [x_seed], axis=0)
                if len(id_seeds) >= num_seeds:
                    break

        t1 = time.time()
        ids_ADF, _, total_iter_ADF = ADF.local_generation(
            num_attribs, l_num, id_seeds.copy(), protected_attribs, constraint,
            model, s_l, epsilon)
        t2 = time.time()
        num_ids_ADF = len(ids_ADF)
        print(
            'ADF:', 'In', total_iter_ADF, 'search iterations,', num_ids_ADF,
            'non-duplicate individual discriminatory instances are generated. Time cost:',
            t2 - t1, 's.')
        num_ids[0] += num_ids_ADF
        time_cost[0] += t2 - t1

        for index, update_interval in enumerate(update_interval_list):
            print('Update interval set to {}:'.format(update_interval))
            t1 = time.time()
            ids_EIDIG, _, total_iter_EIDIG = EIDIG.local_generation(
                num_attribs, l_num, id_seeds.copy(), protected_attribs,
                constraint, model, update_interval, s_l, epsilon)
            t2 = time.time()
            num_ids_EIDIG = len(ids_EIDIG)
            print(
                'EIDIG:', 'In', total_iter_EIDIG, 'search iterations,',
                num_ids_EIDIG,
                'non-duplicate individual discriminatory instances are generated. Time cost:',
                t2 - t1, 's.')
            num_ids[index + 1] += num_ids_EIDIG
            time_cost[index + 1] += t2 - t1

        print('\n')

    avg_num_ids = num_ids / num_experiment_round
    avg_speed = num_ids / time_cost
    print(
        'Results of local phase comparsion on', benchmark,
        'with l_num set to {} given {} discriminatory seeds'.format(
            l_num, num_seeds), ',averaged on', num_experiment_round, 'rounds:')
    print('ADF:', avg_num_ids[0],
          'individual discriminatory instances are generated at a speed of',
          avg_speed[0], 'per second.')
    for index, update_interval in enumerate(update_interval_list):
        print('Update interval set to {}:'.format(update_interval))
        print(
            'EIDIG:', avg_num_ids[index + 1],
            'individual discriminatory instances are generated at a speed of',
            avg_speed[index + 1], 'per second.')

    return num_ids, time_cost
Ejemplo n.º 10
0
def global_comparison(num_experiment_round,
                      benchmark,
                      X,
                      protected_attribs,
                      constraint,
                      model,
                      decay_list,
                      num_seeds=1000,
                      c_num=4,
                      max_iter=10,
                      s_g=1.0):
    """Compare the global phase of ADF and EIDIG on identical seed sets.

    Draws ``num_seeds`` seeds per round, then runs the global generation
    phase of ADF and of EIDIG for each decay factor in ``decay_list``,
    reporting instance counts, speeds, and per-seed iteration counts
    averaged over ``num_experiment_round`` rounds.

    Returns
    -------
    (num_ids, num_iter, time_cost) : per-approach totals over all rounds;
        index 0 is ADF, index k+1 is EIDIG with ``decay_list[k]``.
    """

    num_ids = np.zeros(len(decay_list) + 1, dtype=int)
    num_iter = np.zeros(len(decay_list) + 1, dtype=int)
    # BUG FIX: time_cost was an integer array, so ``+= t2 - t1`` truncated
    # fractional seconds and corrupted avg_speed; use a float array.
    time_cost = np.zeros(len(decay_list) + 1)

    for i in range(num_experiment_round):
        round_now = i + 1
        print('--- ROUND', round_now, '---')
        num_attribs = len(X[0])
        num_dis = 0
        if num_seeds >= len(X):
            seeds = X
        else:
            clustered_data = generation_utilities.clustering(X, c_num)
            seeds = np.empty(shape=(0, num_attribs))
            # BUG FIX: the seed loop reused ``i`` and shadowed the outer
            # round counter; use a dedicated index.
            for seed_idx in range(num_seeds):
                x_seed = generation_utilities.get_seed(clustered_data,
                                                       len(X),
                                                       c_num,
                                                       seed_idx % c_num,
                                                       fashion='Distribution')
                seeds = np.append(seeds, [x_seed], axis=0)
        # Count how many drawn seeds are already discriminatory.
        for seed in seeds:
            similar_seed = generation_utilities.similar_set(
                seed, num_attribs, protected_attribs, constraint)
            if generation_utilities.is_discriminatory(seed, similar_seed,
                                                      model):
                num_dis += 1
        print('Given', num_seeds,
              '(no more than 600 for german credit) seeds,', num_dis,
              'of which are individual discriminatory instances.')

        t1 = time.time()
        ids_ADF, _, total_iter_ADF = ADF.global_generation(
            X, seeds, num_attribs, num_seeds, protected_attribs, constraint,
            model, max_iter, s_g)
        t2 = time.time()
        num_ids_ADF = len(ids_ADF)
        print(
            'ADF:', 'In', total_iter_ADF, 'search iterations,', num_ids_ADF,
            'non-duplicate individual discriminatory instances are generated. Time cost:',
            t2 - t1, 's.')
        num_ids[0] += num_ids_ADF
        num_iter[0] += total_iter_ADF
        time_cost[0] += t2 - t1

        for index, decay in enumerate(decay_list):
            print('Decay factor set to {}:'.format(decay))
            t1 = time.time()
            ids_EIDIG, _, total_iter_EIDIG = EIDIG.global_generation(
                X, seeds, num_attribs, num_seeds, protected_attribs,
                constraint, model, decay, max_iter, s_g)
            t2 = time.time()
            num_ids_EIDIG = len(ids_EIDIG)
            print(
                'EIDIG:', 'In', total_iter_EIDIG, 'search iterations,',
                num_ids_EIDIG,
                'non-duplicate individual discriminatory instances are generated. Time cost:',
                t2 - t1, 's.')
            num_ids[index + 1] += num_ids_EIDIG
            num_iter[index + 1] += total_iter_EIDIG
            time_cost[index + 1] += t2 - t1

        print('\n')

    avg_num_ids = num_ids / num_experiment_round
    avg_speed = num_ids / time_cost
    avg_iter = num_iter / num_experiment_round / num_seeds
    print('Results of global phase comparsion on', benchmark,
          'given {} seeds'.format(num_seeds), ',averaged on',
          num_experiment_round, 'rounds:')
    print('ADF:', avg_num_ids[0],
          'individual discriminatory instances are generated at a speed of',
          avg_speed[0],
          'per second, and the number of iterations on a singe seed is',
          avg_iter[0], '.')
    for index, decay in enumerate(decay_list):
        print('Decay factor set to {}:'.format(decay))
        print(
            'EIDIG:', avg_num_ids[index + 1],
            'individual discriminatory instances are generated at a speed of',
            avg_speed[index + 1],
            'per second, and the number of iterations on a singe seed is',
            avg_iter[index + 1], '.')

    return num_ids, num_iter, time_cost