Example #1
def ml_test():
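    """Generate a synthetic CSV, stream its rows through CASTLE, then run
    validation for k = 1..9 on the averaged output groups and report the
    mean accuracy before removing the generated file."""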
    print("test1")
    args = app.parse_args()
    global counter
    counter = 0
    frame = pd.read_csv(
        csv_gen.generate(
            filename,
            rows=rows,
            headers=["Age", "GPA", "HoursPW", "EducationLvl", "Employed"],
            datatypes=["int120", "float5", "int56", "int6", "int2"]))
    headers = ["Age", "GPA", "HoursPW", "EducationLvl"]
    params = Parameters(args)
    sensitive_attr = "Employed"
    stream = CASTLE(handler, headers, sensitive_attr, params)

    for (_, row) in frame.iterrows():
        counter += 1
        stream.insert(row)

    # A function which tells us if there are any tuples which haven't been outputted yet

    avg = mlu.average_group(sarray)
    assert isinstance(avg, pd.DataFrame)
    avg_features = avg[["Age", "HoursPW", "EducationLvl", "GPA"]]
    avg_norm = (avg_features - avg_features.mean()) / (avg_features.std())
    total = 0
    for i in range(1, 10):
        valid = validation(avg_norm, avg[sensitive_attr], i)
        total += valid
    print("Average Accuracy for CASTLE: {}".format(total / 9))
    os.remove("{}.csv".format(filename))
Example #2
def main():
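    """Parse the CLI arguments, seed numpy's RNG, sample the input CSV and
    stream every row through CASTLE, dumping diagnostics on failure."""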
    # Parse the regular arguments for the program
    args = app.parse_args()
    print("args: {}".format(args))

    # Set a random seed
    seed = np.random.randint(1e6)
    np.random.seed(seed)
    print("seed: {}".format(seed))

    # Generate some parameters
    params = generate_parameters(args)
    print("params: {}".format(params))

    # Read the file contents
    frame = pd.read_csv(args.filename).sample(args.sample_size)

    headers = list(frame.columns.values)[1:-1]
    print("headers: {}".format(headers))
    sensitive_attr = headers[-1]
    print("sensitive_attr: {}".format(sensitive_attr))
    stream = CASTLE(handler, headers, sensitive_attr, params)

    try:
        for (_, row) in tqdm(frame.iterrows()):
            stream.insert(row)
    except Exception:
        traceback.print_exc()
        print(seed, params, args.filename, headers, sensitive_attr)
        sys.exit(1)
Example #3
def test_k(file_name, k_list):
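    """Run CASTLE once per k in k_list and plot the average information
    loss of the resulting clusters against k."""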
    frame = pd.read_csv(file_name)
    headers = list(frame.columns.values)[1:-1]

    avg_loss_list = []

    for k in k_list:
        params = Parameters()

        params.k = k
        params.delta = 10
        params.beta = 10
        params.mu = 10

        stream = CASTLE(handler, headers, "FareAmount", params)

        for (_, row) in frame.iterrows():
            stream.insert(row)

        clusters = stream.big_gamma

        cum_loss = 0
        for cluster in clusters:
            cum_loss += cluster.information_loss(stream.global_ranges)
        avg_loss = cum_loss / len(clusters)

        avg_loss_list.append(avg_loss)

    plot_average_loss_1D(avg_loss_list, k_list, "k")
Example #4
def jitter_wrapper(params, frame):
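    """Stream the frame through CASTLE, recording each tuple's insertion
    time, and return the jitter (mean absolute difference between
    consecutive entries of latency_list)."""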
    headers = ["PickupLocationID", "TripDistance"]

    stream = CASTLE(handler, headers, params)

    for (_, row) in frame.iterrows():
        latency_dict[row["pid"]] = time.time()
        stream.insert(row)

    jitter = 0
    for i in range(len(latency_list) - 1):
        jitter += abs(latency_list[i] - latency_list[i + 1])
    jitter /= len(latency_list) - 1

    return jitter
Example #5
def main():
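    """Sample the input CSV, stream it through CASTLE and optionally
    display a visualisation of the resulting clusters."""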
    args = app.parse_args()
    print("args: {}".format(args))

    seed = args.seed if args.seed is not None else np.random.randint(1e6)
    np.random.seed(seed)
    print("USING RANDOM SEED: {}".format(seed))

    frame = pd.read_csv(args.filename).sample(args.sample_size)

    headers = ["PickupLocationID", "TripDistance"]
    params = Parameters(args)
    sensitive_attr = "FareAmount"

    stream = CASTLE(handler, headers, sensitive_attr, params)

    for (_, row) in frame.iterrows():
        stream.insert(row)

    if args.display:
        display_visualisation(stream)
Example #6
def test_beta_mu(file_name, beta_list, mu_list):
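    """Sweep every (beta, mu) pair, run CASTLE for each and plot the
    average cluster information loss as a 2D surface."""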
    frame = pd.read_csv(file_name)
    headers = list(frame.columns.values)[1:-1]

    info_loss = []

    for mu in mu_list:
        print("mu: {}".format(mu))

        avg_loss_list = []

        for beta in beta_list:
            print("beta: {}".format(beta))
            params = Parameters()

            params.k = 10
            params.delta = 200
            params.beta = beta
            params.mu = mu
            params.l = 1
            params.dp = False

            stream = CASTLE(handler, headers, "FareAmount", params)

            for (_, row) in frame.iterrows():
                stream.insert(row)

            clusters = stream.big_gamma

            cum_loss = 0
            for cluster in clusters:
                cum_loss += cluster.information_loss(stream.global_ranges)
            avg_loss = cum_loss / len(clusters)
            avg_loss_list.append(avg_loss)

        info_loss.append(np.array(avg_loss_list))

    X, Y = np.meshgrid(beta_list, mu_list)
    plot_average_loss_2D(np.array(info_loss), X, "Beta", Y, "Mu")
Example #7
def main():
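    """Train a neural network on the raw diabetes data, then retrain it on
    CASTLE-anonymised data for every (phi, big_beta) pair and plot the
    resulting accuracy surface."""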
    args = app.parse_args()

    frame = pd.read_csv("diabetes.csv")
    headers = [
        "pregnancies", "glucose", "bloodPressure", "skinThickness", "insulin",
        "bmi", "diabetesPedigree", "age"
    ]
    sensitive_attr = "outcome"
    X_train, X_test, Y_train, Y_test = train_test_split(frame[headers],
                                                        frame[sensitive_attr],
                                                        test_size=0.3)
    print("Normal Data")
    NN(X_train, X_test, Y_train, Y_test)
    args.k = 7
    args.l = 1
    args.delta = 100
    args.mu = 100
    args.beta = 25
    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.25, 0.5, 0.75, 1]
    acc_list = []
    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))
            # Work on a copy so the original training split is not mutated
            train = X_train.copy()
            train[sensitive_attr] = Y_train
            train['pid'] = train.index
            global sarray
            sarray = []
            params = Parameters(args)
            stream = CASTLE(handler, headers, sensitive_attr, params)
            print("CASTLE START")
            counter = 0
            for (_, row) in train.iterrows():
                counter += 1
                stream.insert(row)
            while counter <= args.delta:
                counter += 1
                stream.cycle()
            print("CASTLE END")
            grped = mlu.average_group(sarray)
            acc = NN(grped[headers], X_test, grped[sensitive_attr], Y_test)
            avg_acc_list.append(acc)
        acc_list.append(np.array(avg_acc_list))
    print(acc_list)
    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X,
                    Y,
                    np.array(acc_list),
                    rstride=1,
                    cstride=1,
                    cmap='winter',
                    edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('AUC-ROC')
    plt.show()
Example #8
def main():
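    """Measure k-NN accuracy on the raw diabetes data, then repeat the
    measurement on CASTLE-anonymised data for every (phi, big_beta) pair
    and plot the averages as a 3D surface."""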
    args = app.parse_args()

    frame = pd.read_csv("diabetes.csv")
    headers = [
        "pregnancies", "glucose", "bloodPressure", "skinThickness", "insulin",
        "bmi", "diabetesPedigree", "age"
    ]
    extended_headers = [
        "spcpregnancies", "minpregnancies", "maxpregnancies", "spcglucose",
        "minglucose", "maxglucose", "spcbloodPressure", "minbloodPressure",
        "maxbloodPressure", "spcskinThickness", "minskinThickness",
        "maxskinThickness", "spcinsulin", "mininsulin", "maxinsulin", "spcbmi",
        "minbmi", "maxbmi", "spcdiabetesPedigree", "mindiabetesPedigree",
        "maxdiabetesPedigree", "spcage", "minage", "maxage"
    ]
    sensitive_attr = "outcome"
    total = 0
    for i in ks:
        valid = validation(frame, frame[sensitive_attr], i)
        print("K={} Accuracy: {}%".format(i, round(valid * 100), 5))
        total += valid
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / 9) * 100, 5)))

    frame["pid"] = frame.index
    args.k = 7
    args.l = 1
    args.delta = 100
    args.mu = 100
    args.beta = 25
    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.25, 0.5, 0.75, 1]
    acc_list = []
    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))
            average = 0
            for _ in range(10):
                frame = pd.read_csv("diabetes.csv")
                frame["pid"] = frame.index

                global sarray
                sarray = []
                params = Parameters(args)
                stream = CASTLE(handler, headers, sensitive_attr, params)

                for (_, row) in frame.iterrows():
                    stream.insert(row)

                dataframes = []
                for s in sarray:
                    df = s.to_frame().transpose()
                    dataframes.append(df)
                avg = pd.concat(dataframes, ignore_index=True, sort=True)
                avg_features = avg[extended_headers]
                total = 0
                for i in ks:
                    valid = validation(avg_features, avg[sensitive_attr], i)
                    # print("K={} Accuracy: {}%".format(i, round(valid*100), 5))
                    total += valid
                print("Accuracy: {}%".format((total / 9) * 100))
                average += (total / 9)
            print("Average Accuracy: {}%".format((average / 10) * 100))
            avg_acc_list.append(average / 10)
        acc_list.append(np.array(avg_acc_list))

    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X,
                    Y,
                    np.array(acc_list),
                    rstride=1,
                    cstride=1,
                    cmap='winter',
                    edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('Average KNN Accuracy')
    plt.show()
Example #9
def main():
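    """Run the same pre-/post-CASTLE k-NN comparison on the adult census
    data, predicting salary for every (phi, big_beta) pair and saving the
    resulting surface plot."""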
    args = app.parse_args()
    print("Loading in data")
    frame = pd.read_csv("adult.csv")
    cat = {
        "workclass": [
            "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
            "Local-gov", "State-gov", "Without-pay", "Never-worked", "?"
        ],
        "maritalstatus": [
            'Married-civ-spouse', "Divorced", "Never-married", "Separated",
            "Widowed", "Married-spouse-absent", "Married-AF-spouse", "?"
        ],
        "occupation": [
            "Tech-support", "Craft-repair", "Other-service", "Sales",
            "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
            "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
            "Transport-moving", "Priv-house-serv", "Protective-serv",
            "Armed-Forces", "?"
        ],
        "relationship": [
            "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
            "Unmarried", "?"
        ],
        "race": [
            "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other",
            "Black", "?"
        ],
        "sex": ["Male", "Female", "?"],
        "nativecountry": [
            "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
            "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan",
            "Greece", "South", "China", "Cuba", "Iran", "Honduras",
            "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
            "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
            "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
            "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
            "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands", "?"
        ],
        "salary": [">50K", "<=50K"]
    }
    frame["pid"] = frame.index
    headers = [
        "age", "workclass", "fnlwgt", "maritalstatus", "educationnum",
        "occupation", "relationship", "race", "sex", "nativecountry",
        "capitalgain", "capitalloss", "hoursperweek"
    ]
    extended_headers = [
        "spcage", "minage", "maxage", "spcworkclass", "minworkclass",
        "maxworkclass", "spcfnlwgt", "minfnlwgt", "maxfnlwgt",
        "spcmaritalstatus", "minmaritalstatus", "maxmaritalstatus",
        "spceducationnum", "mineducationnum", "maxeducationnum",
        "spcoccupation", "minoccupation", "maxoccupation", "spcrelationship",
        "minrelationship", "maxrelationship", "spcrace", "minrace", "maxrace",
        "spcsex", "minsex", "maxsex", "spcnativecountry", "minnativecountry",
        "maxnativecountry", "spccapitalgain", "mincapitalgain",
        "maxcapitalgain", "spccapitalloss", "mincapitalloss", "maxcapitalloss",
        "spchoursperweek", "minhoursperweek", "maxhoursperweek"
    ]
    sensitive_attr = "salary"
    total = 0
    data = frame
    print("Processing Data")
    processed = mlu.process(data, cat)
    print("Processed Data")
    processed[sensitive_attr] = processed[sensitive_attr].astype('int')
    for i in ks:
        valid = validation(processed[headers], processed[sensitive_attr], i)
        print("K={} Accuracy: {}%".format(i, round(valid * 100), 5))
        total += valid
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / len(ks)) * 100, 5)))

    frame["pid"] = frame.index
    args.k = 1000
    args.l = 1
    args.delta = 10000
    args.mu = 100
    args.beta = 50
    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.35, 0.5, 0.75, 1]
    acc_list = []
    print("Size: {}".format(frame.shape))
    print("Starting Loop")
    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))
            average = 0
            for _ in range(10):
                frame = pd.read_csv("adult.csv")
                print("Processing Data")
                processed = mlu.process(frame, cat)
                print("Processed Data")
                processed[sensitive_attr] = processed[sensitive_attr].astype(
                    'int')
                processed["pid"] = processed.index
                global sarray
                sarray = []
                params = Parameters(args)
                stream = CASTLE(handler, headers, sensitive_attr, params)
                print("Starting CASTLE")
                counter = 0
                for (_, row) in processed.iterrows():
                    counter += 1
                    stream.insert(row)
                while counter <= args.delta:
                    print("Cycling")
                    counter += 1
                    stream.cycle()
                print("Finished CASTLE")
                print(len(sarray))
                dataframes = []
                for s in sarray:
                    df = s.to_frame().transpose()
                    dataframes.append(df)
                avg = pd.concat(dataframes, ignore_index=True, sort=True)
                avg_features = avg[extended_headers]
                total = 0
                avg[sensitive_attr] = avg[sensitive_attr].astype('int')
                for i in ks:
                    valid = validation(avg_features, avg[sensitive_attr], i)
                    # print("K={} Accuracy: {}%".format(i, round(valid*100), 5))
                    total += valid
                average += (total / len(ks))
                print("Phi: {}, BBeta: {}, Average Accuracy: {}%".format(
                    args.phi, args.big_beta, round((total / len(ks)) * 100, 5)))
            avg_acc_list.append(average / 10)
            print("Overall Average: {}%".format((average / 10) * 100))
        acc_list.append(np.array(avg_acc_list))

    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X,
                    Y,
                    np.array(acc_list),
                    rstride=1,
                    cstride=1,
                    cmap='winter',
                    edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('Average Accuracy of KNN for Predicting Salary')
    plt.savefig("OrigData.png")
Example #10
def main():
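    """Evaluate NN and k-NN accuracy on the FIFA 19 data before and after
    anonymising it with CASTLE."""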
    args = app.parse_args()
    frame = pd.read_csv("fifa19.csv")[[
        "Age", "Nationality", "Wage", "Value", "Potential", "Club", "Position",
        "Overall"
    ]]
    headers = [
        "Age", "Nationality", "Potential", "Wage", "Value", "Club", "Position"
    ]
    sensitive_attr = "Overall"

    frame['pid'] = frame.index

    cat = {
        "Club": frame.Club.unique().tolist(),
        "Nationality": frame.Nationality.unique().tolist(),
        "Position": frame.Position.unique().tolist(),
    }
    processed = mlu.process(frame, cat)
    from_short_num(processed, ["Wage", "Value"])

    X = normalise(processed[headers])
    Y = processed[sensitive_attr]
    Y = Y.astype('int')

    print("Pre-CASTLE Test Accuracy: {}%".format(round(NN(X, Y) * 100, 5)))

    print("Pre-CASTLE KNN")

    total = 0
    for i in range(1, 10):
        valid = validation(X, Y, i)
        print("K={} Accuracy: {}%".format(i, round(valid * 100), 5))
        total += valid
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / 9) * 100, 5)))

    params = Parameters(args)
    stream = CASTLE(handler, headers, sensitive_attr, params)
    for (_, row) in processed.iterrows():
        stream.insert(row)
    avg = mlu.average_group(sarray, [("pid", np.int64), ("Overall", np.int64),
                                     ("Club", np.int64),
                                     ("Nationality", np.int64),
                                     ("Position", np.int64)])

    X = normalise(avg[headers])
    Y = avg[sensitive_attr]
    Y = Y.astype('int')

    print("Post-CASTLE Test Accuracy: {}%".format(round(NN(X, Y) * 100, 5)))

    print("Post-CASTLE KNN")

    total = 0
    for i in range(1, 10):
        valid = validation(X, Y, i)
        print("K={} Accuracy: {}%".format(i, round(valid * 100), 5))
        total += valid
    print("Average Accuracy for Post-CASTLE: {}%".format(
        round((total / 9) * 100, 5)))