def main():
    """Stream a sampled CSV through CASTLE, logging the seed for replay.

    On any failure during insertion, prints the traceback plus the full
    run configuration (seed, params, filename, headers, sensitive_attr)
    so the run can be reproduced, then exits with status 1.
    """
    # Parse the regular arguments for the program
    args = app.parse_args()
    print("args: {}".format(args))

    # Draw and report a random seed so failing runs are reproducible
    seed = np.random.randint(1e6)
    np.random.seed(seed)
    print("seed: {}".format(seed))

    # Generate some parameters
    params = generate_parameters(args)
    print("params: {}".format(params))

    # Read the file contents; first column (id) and last are dropped
    frame = pd.read_csv(args.filename).sample(args.sample_size)
    headers = list(frame.columns.values)[1:-1]
    print("headers: {}".format(headers))
    # NOTE(review): the sensitive attribute is the last element of
    # `headers` and is ALSO left inside `headers` passed to CASTLE —
    # the other drivers keep them disjoint; confirm this is intended.
    sensitive_attr = headers[-1]
    print("sensitive_attr: {}".format(sensitive_attr))

    stream = CASTLE(handler, headers, sensitive_attr, params)

    try:
        for (_, row) in tqdm(frame.iterrows()):
            stream.insert(row)
    except Exception:
        # Top-level boundary: dump everything needed to reproduce, then bail
        traceback.print_exc()
        print(seed, params, args.filename, headers, sensitive_attr)
        sys.exit(1)
def ml_test():
    """End-to-end CASTLE accuracy check against a generated toy dataset.

    Generates a CSV, streams every row through CASTLE, then averages KNN
    validation accuracy over k = 1..9 on the anonymised output. Cleans up
    the generated file afterwards.
    """
    print("test1")
    args = app.parse_args()

    global counter
    counter = 0

    # NOTE(review): `filename` and `rows` are not defined in this block —
    # presumably module-level globals; confirm before refactoring.
    frame = pd.read_csv(
        csv_gen.generate(
            filename,
            rows=rows,
            headers=["Age", "GPA", "HoursPW", "EducationLvl", "Employed"],
            datatypes=["int120", "float5", "int56", "int6", "int2"]))

    headers = ["Age", "GPA", "HoursPW", "EducationLvl"]
    params = Parameters(args)
    sensitive_attr = "Employed"

    stream = CASTLE(handler, headers, sensitive_attr, params)
    for (_, row) in frame.iterrows():
        counter += 1
        stream.insert(row)

    # Aggregate CASTLE's output groups and z-score the features
    avg = mlu.average_group(sarray)
    assert type(avg) is pd.DataFrame
    avg_features = avg[["Age", "HoursPW", "EducationLvl", "GPA"]]
    avg_norm = (avg_features - avg_features.mean()) / (avg_features.std())

    total = 0
    for k in range(1, 10):
        total += validation(avg_norm, avg[sensitive_attr], k)
    print("Average Accuracy for CASTLE: {}".format(total / 9))

    # Remove the generated fixture
    os.remove("{}.csv".format(filename))
def test_parse_args():
    """Table-driven test for parse_args.

    Each case is (input_string, expected_name, expected_items); None marks
    a component that is expected to fail parsing.

    Do we want the ints to be strings or ints? For now, it's all gonna be
    string because slack's formatting.

    FIX: the escape cases are written as raw strings (r"\,") — the
    original plain literals relied on invalid escape sequences such as
    "\," and "\2", which emit SyntaxWarning on Python 3.12+ (and are
    slated to become errors). Raw strings produce the exact same
    character values.
    """
    io = [
        ("name [1,2,3]", "name", ['1', '2', '3']),
        ("name_1 [1,2,3]", "name_1", ['1', '2', '3']),
        ("name-1 [1,2,3]", "name-1", ['1', '2', '3']),
        ("12 [1,2,3]", None, ['1', '2', '3']),
        ("name 1 [1,2,3]", "name", None),
        ("name ]1,2,3]", "name", None),
        ("name [1,,3]", "name", None),
        # Trailing comma is fine, the last empty char gets removed
        ("name [1,3,]", "name", ['1', '3']),
        (r"name [1, \,, \2]", "name", ['1', ',', '2']),
        (r"name [\[, \], \,]", "name", ['[', ']', ',']),
        ("name []", "name", None),  # array cannot be empty
        (r"name [\2, \3]", "name", ['2', '3']),
        (r"name [(Ali\,Shaown),(Shaown\, Ali)]", "name",
         ['(Ali,Shaown)', '(Shaown, Ali)']),
        # NOTE(review): the next three cases look identical here — the
        # original's distinguishing whitespace was likely collapsed in
        # transit; confirm against the upstream file.
        ("name [A B, C D]", "name", ["A B", "C D"]),
        # Space chars within the items are preserved
        ("name [A B, C D]", "name", ["A B", "C D"]),
        # Space in between the items are NOT
        ("name [A B, C D]", "name", ["A B", "C D"]),
    ]
    for (input_string, arg1, arg2) in io:
        assert parse_args(input_string) == (arg1, arg2)
def main():
    """Stream a sampled taxi CSV through CASTLE, optionally visualising it."""
    args = app.parse_args()
    print("args: {}".format(args))

    # Honour an explicit --seed; otherwise draw one, and always report it
    # so the run can be reproduced.
    seed = args.seed if args.seed else np.random.randint(1e6)
    np.random.seed(seed)
    print("USING RANDOM SEED: {}".format(seed))

    frame = pd.read_csv(args.filename).sample(args.sample_size)
    headers = ["PickupLocationID", "TripDistance"]
    params = Parameters(args)
    sensitive_attr = "FareAmount"

    stream = CASTLE(handler, headers, sensitive_attr, params)
    for (_, row) in frame.iterrows():
        stream.insert(row)

    if args.display:
        display_visualisation(stream)
# NOTE(review): this chunk begins mid-function — the enclosing `def` is not
# visible here. The header below is reconstructed from the call site
# (`jitter_wrapper(params, frame)` in the __main__ block); confirm the real
# signature against the original file before merging.
def jitter_wrapper(params, frame):
    """Insert `frame` into CASTLE and return the mean latency jitter.

    Records an insertion timestamp per pid in the module-level
    `latency_dict`, then averages the absolute differences between
    consecutive entries of the module-level `latency_list`.
    """
    headers = ["PickupLocationID", "TripDistance"]
    # NOTE(review): CASTLE is called with only 3 args here (no
    # sensitive_attr), unlike the other drivers — confirm intentional.
    stream = CASTLE(handler, headers, params)
    for (_, row) in frame.iterrows():
        latency_dict[row["pid"]] = time.time()
        stream.insert(row)

    # Mean absolute difference between consecutive latencies
    jitter = 0
    for i in range(len(latency_list) - 1):
        jitter += abs(latency_list[i] - latency_list[i + 1])
    jitter /= len(latency_list) - 1
    return jitter


if __name__ == "__main__":
    args = app.parse_args()
    print("args: {}".format(args))

    # Honour an explicit --seed; otherwise draw one and report it
    seed = args.seed if args.seed else np.random.randint(1e6)
    np.random.seed(seed)
    print("USING RANDOM SEED: {}".format(seed))

    frame = pd.read_csv(args.filename).sample(20)
    params = Parameters(args)

    jitter = jitter_wrapper(params, frame)
    print("JITTER: {}s".format(jitter))
def main():
    """Grid-search CASTLE's phi/big_beta on the diabetes dataset with a NN.

    Trains a baseline NN on the raw train/test split, then for every
    (phi, big_beta) pair streams the training rows through CASTLE,
    retrains on the anonymised output, and plots AUC-ROC as a 3D surface.
    """
    args = app.parse_args()

    frame = pd.read_csv("diabetes.csv")
    headers = [
        "pregnancies", "glucose", "bloodPressure", "skinThickness",
        "insulin", "bmi", "diabetesPedigree", "age"
    ]
    sensitive_attr = "outcome"

    X_train, X_test, Y_train, Y_test = train_test_split(
        frame[headers], frame[sensitive_attr], test_size=0.3)

    print("Normal Data")
    NN(X_train, X_test, Y_train, Y_test)

    # Fixed CASTLE parameters; phi and big_beta are swept below
    args.k = 7
    args.l = 1
    args.delta = 100
    args.mu = 100
    args.beta = 25

    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.25, 0.5, 0.75, 1]
    acc_list = []

    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))

            # BUG FIX: the original did `train = X_train`, which aliases
            # rather than copies — adding the sensitive/pid columns then
            # mutated X_train itself and the mutation leaked into every
            # later grid iteration. Work on an explicit copy.
            train = X_train.copy()
            train[sensitive_attr] = Y_train
            train['pid'] = train.index

            global sarray
            sarray = []
            params = Parameters(args)
            stream = CASTLE(handler, headers, sensitive_attr, params)

            print("CASTLE START")
            counter = 0
            for (_, row) in train.iterrows():
                counter += 1
                stream.insert(row)
            # Keep cycling until delta expires so buffered tuples flush
            while (counter <= args.delta):
                counter += 1
                stream.cycle()
            print("CASTLE END")

            grped = mlu.average_group(sarray)
            acc = NN(grped[headers], X_test, grped[sensitive_attr], Y_test)
            avg_acc_list.append(acc)
        acc_list.append(np.array(avg_acc_list))

    print(acc_list)

    # Surface plot: big_beta on X, log(phi) on Y, AUC-ROC on Z
    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X, Y, np.array(acc_list),
                    rstride=1, cstride=1, cmap='winter', edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('AUC-ROC')
    plt.show()
def main():
    """Grid-search CASTLE's phi/big_beta on the diabetes dataset with KNN.

    Computes a pre-CASTLE KNN baseline over the module-level `ks`, then
    for each (phi, big_beta) pair anonymises the data through CASTLE ten
    times, averages KNN accuracy on the generalised output, and plots the
    result as a 3D surface.
    """
    args = app.parse_args()

    frame = pd.read_csv("diabetes.csv")
    headers = [
        "pregnancies", "glucose", "bloodPressure", "skinThickness",
        "insulin", "bmi", "diabetesPedigree", "age"
    ]
    # Generalised columns CASTLE emits: a spc/min/max triple per attribute
    extended_headers = [
        "spcpregnancies", "minpregnancies", "maxpregnancies", "spcglucose",
        "minglucose", "maxglucose", "spcbloodPressure", "minbloodPressure",
        "maxbloodPressure", "spcskinThickness", "minskinThickness",
        "maxskinThickness", "spcinsulin", "mininsulin", "maxinsulin",
        "spcbmi", "minbmi", "maxbmi", "spcdiabetesPedigree",
        "mindiabetesPedigree", "maxdiabetesPedigree", "spcage", "minage",
        "maxage"
    ]
    sensitive_attr = "outcome"

    # Pre-CASTLE baseline
    total = 0
    for i in ks:
        valid = validation(frame, frame[sensitive_attr], i)
        # BUG FIX: original `round(valid * 100), 5` rounded to 0 decimals
        # and passed the 5 as an ignored extra format argument.
        print("K={} Accuracy: {}%".format(i, round(valid * 100, 5)))
        total += valid
    # BUG FIX: averages divided by a hard-coded 9 although the loop runs
    # len(ks) times (the adult-dataset script already uses len(ks)).
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / len(ks)) * 100, 5)))

    frame["pid"] = frame.index

    # Fixed CASTLE parameters; phi and big_beta are swept below
    args.k = 7
    args.l = 1
    args.delta = 100
    args.mu = 100
    args.beta = 25

    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.25, 0.5, 0.75, 1]
    acc_list = []

    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))
            average = 0
            # Repeat 10 times to smooth CASTLE's randomness
            for looping in range(0, 10):
                frame = pd.read_csv("diabetes.csv")
                frame["pid"] = frame.index

                global sarray
                sarray = []
                params = Parameters(args)
                stream = CASTLE(handler, headers, sensitive_attr, params)
                for (_, row) in frame.iterrows():
                    stream.insert(row)

                # Collect CASTLE's output tuples into one DataFrame
                dataframes = []
                for s in sarray:
                    df = s.to_frame().transpose()
                    dataframes.append(df)
                avg = pd.concat(dataframes, ignore_index=True, sort=True)
                avg_features = avg[extended_headers]

                total = 0
                for i in ks:
                    valid = validation(avg_features, avg[sensitive_attr], i)
                    total += valid
                # BUG FIX: same hard-coded /9 → /len(ks) as above
                print("Accuracy: {}%".format((total / len(ks)) * 100))
                average += (total / len(ks))
            print("Average Accuracy: {}%".format((average / 10) * 100))
            avg_acc_list.append(average / 10)
        acc_list.append(np.array(avg_acc_list))

    # Surface plot: big_beta on X, log(phi) on Y, mean KNN accuracy on Z
    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X, Y, np.array(acc_list),
                    rstride=1, cstride=1, cmap='winter', edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('Average KNN Accuracy')
    plt.show()
def main():
    """Grid-search CASTLE's phi/big_beta on the adult dataset with KNN.

    Encodes the categorical columns, computes a pre-CASTLE KNN baseline
    over the module-level `ks`, then for each (phi, big_beta) pair runs
    the data through CASTLE ten times, averages KNN accuracy on the
    generalised output, and saves the surface plot to OrigData.png.
    """
    args = app.parse_args()

    print("Loading in data")
    frame = pd.read_csv("adult.csv")

    # Category value lists for encoding; "?" marks a missing value
    cat = {
        "workclass": [
            "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
            "Local-gov", "State-gov", "Without-pay", "Never-worked", "?"
        ],
        "maritalstatus": [
            'Married-civ-spouse', "Divorced", "Never-married", "Separated",
            "Widowed", "Married-spouse-absent", "Married-AF-spouse", "?"
        ],
        "occupation": [
            "Tech-support", "Craft-repair", "Other-service", "Sales",
            "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
            "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
            "Transport-moving", "Priv-house-serv", "Protective-serv",
            "Armed-Forces", "?"
        ],
        "relationship": [
            "Wife", "Own-child", "Husband", "Not-in-family",
            "Other-relative", "Unmarried", "?"
        ],
        "race": [
            "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other",
            "Black", "?"
        ],
        "sex": ["Male", "Female", "?"],
        "nativecountry": [
            "United-States", "Cambodia", "England", "Puerto-Rico",
            "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
            "Japan", "Greece", "South", "China", "Cuba", "Iran",
            "Honduras", "Philippines", "Italy", "Poland", "Jamaica",
            "Vietnam", "Mexico", "Portugal", "Ireland", "France",
            "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
            "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland",
            "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago",
            "Peru", "Hong", "Holand-Netherlands", "?"
        ],
        "salary": [">50K", "<=50K"]
    }

    frame["pid"] = frame.index
    headers = [
        "age", "workclass", "fnlwgt", "maritalstatus", "educationnum",
        "occupation", "relationship", "race", "sex", "nativecountry",
        "capitalgain", "capitalloss", "hoursperweek"
    ]
    # Generalised columns CASTLE emits: a spc/min/max triple per attribute
    extended_headers = [
        "spcage", "minage", "maxage", "spcworkclass", "minworkclass",
        "maxworkclass", "spcfnlwgt", "minfnlwgt", "maxfnlwgt",
        "spcmaritalstatus", "minmaritalstatus", "maxmaritalstatus",
        "spceducationnum", "mineducationnum", "maxeducationnum",
        "spcoccupation", "minoccupation", "maxoccupation",
        "spcrelationship", "minrelationship", "maxrelationship", "spcrace",
        "minrace", "maxrace", "spcsex", "minsex", "maxsex",
        "spcnativecountry", "minnativecountry", "maxnativecountry",
        "spccapitalgain", "mincapitalgain", "maxcapitalgain",
        "spccapitalloss", "mincapitalloss", "maxcapitalloss",
        "spchoursperweek", "minhoursperweek", "maxhoursperweek"
    ]
    sensitive_attr = "salary"

    # Pre-CASTLE baseline.
    # CLEANUP: the original aliased `data = frame` for no reason and set
    # frame["pid"] a second time after this section; both removed.
    print("Processing Data")
    processed = mlu.process(frame, cat)
    print("Processed Data")
    processed[sensitive_attr] = processed[sensitive_attr].astype('int')

    total = 0
    for i in ks:
        valid = validation(processed[headers], processed[sensitive_attr], i)
        # BUG FIX: original `round(valid * 100), 5` rounded to 0 decimals
        # and passed the 5 as an ignored extra format argument.
        print("K={} Accuracy: {}%".format(i, round(valid * 100, 5)))
        total += valid
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / len(ks)) * 100, 5)))

    # Fixed CASTLE parameters; phi and big_beta are swept below
    args.k = 1000
    args.l = 1
    args.delta = 10000
    args.mu = 100
    args.beta = 50

    Phi = [1, 10, 100, 1000]
    Big_Beta = [0.35, 0.5, 0.75, 1]
    acc_list = []
    print("Size: {}".format(frame.shape))
    print("Starting Loop")

    for args.phi in Phi:
        print("Phi: {}".format(args.phi))
        avg_acc_list = []
        for args.big_beta in Big_Beta:
            print("Big Beta: {}".format(args.big_beta))
            average = 0
            # Repeat 10 times to smooth CASTLE's randomness
            for loop in range(0, 10):
                frame = pd.read_csv("adult.csv")
                print("Processing Data")
                processed = mlu.process(frame, cat)
                print("Processed Data")
                processed[sensitive_attr] = processed[
                    sensitive_attr].astype('int')
                processed["pid"] = processed.index

                global sarray
                sarray = []
                params = Parameters(args)
                stream = CASTLE(handler, headers, sensitive_attr, params)

                print("Starting CASTLE")
                counter = 0
                for (_, row) in processed.iterrows():
                    counter += 1
                    stream.insert(row)
                # Keep cycling until delta expires so buffered tuples flush
                while (counter <= args.delta):
                    print("Cycling")
                    counter += 1
                    stream.cycle()
                print("Finished CASTLE")
                print(len(sarray))

                # Collect CASTLE's output tuples into one DataFrame
                dataframes = []
                for s in sarray:
                    df = s.to_frame().transpose()
                    dataframes.append(df)
                avg = pd.concat(dataframes, ignore_index=True, sort=True)
                avg_features = avg[extended_headers]

                total = 0
                avg[sensitive_attr] = avg[sensitive_attr].astype('int')
                for i in ks:
                    valid = validation(avg_features, avg[sensitive_attr], i)
                    total += valid
                # BUG FIX: post-CASTLE averages divided by a hard-coded 9
                # while the loop runs len(ks) times (the pre-CASTLE block
                # above already used len(ks)); and the `5` in the print was
                # an ignored format argument instead of round()'s ndigits.
                average += (total / len(ks))
                print("Phi: {}, BBeta: {}, Average Accuracy: {}%".format(
                    args.phi, args.big_beta,
                    round((total / len(ks)) * 100, 5)))
            avg_acc_list.append(average / 10)
            print("Overall Average: {}%".format((average / 10) * 100))
        acc_list.append(np.array(avg_acc_list))

    # Surface plot: big_beta on X, log(phi) on Y, mean KNN accuracy on Z
    X, Y = np.meshgrid(Big_Beta, np.log(Phi))
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X, Y, np.array(acc_list),
                    rstride=1, cstride=1, cmap='winter', edgecolor='none')
    ax.set_xlabel("Big Beta")
    ax.set_ylabel("Log(Phi)")
    ax.set_zlabel('Average Accuracy of KNN for Predicting Salary')
    plt.savefig("OrigData.png")
def main():
    """Compare NN and KNN accuracy on FIFA 19 data pre- and post-CASTLE.

    Encodes categorical columns, normalises features, measures baseline
    NN/KNN accuracy, streams all rows through CASTLE, then repeats the
    measurements on the anonymised output.
    """
    args = app.parse_args()

    frame = pd.read_csv("fifa19.csv")[[
        "Age", "Nationality", "Wage", "Value", "Potential", "Club",
        "Position", "Overall"
    ]]
    headers = [
        "Age", "Nationality", "Potential", "Wage", "Value", "Club",
        "Position"
    ]
    sensitive_attr = "Overall"
    frame['pid'] = frame.index

    # Categorical columns are encoded from their observed unique values
    cat = {
        "Club": frame.Club.unique().tolist(),
        "Nationality": frame.Nationality.unique().tolist(),
        "Position": frame.Position.unique().tolist(),
    }
    processed = mlu.process(frame, cat)
    # Convert "60K"/"1.2M"-style money strings to numbers in place
    from_short_num(processed, ["Wage", "Value"])

    X = normalise(processed[headers])
    Y = processed[sensitive_attr]
    Y = Y.astype('int')

    print("Pre-CASTLE Test Accuracy: {}%".format(round(NN(X, Y) * 100, 5)))

    print("Pre-CASTLE KNN")
    total = 0
    for i in range(1, 10):
        valid = validation(X, Y, i)
        # BUG FIX: original `round(valid * 100), 5` rounded to 0 decimals
        # and passed the 5 as an ignored extra format argument.
        print("K={} Accuracy: {}%".format(i, round(valid * 100, 5)))
        total += valid
    print("Average Accuracy for Pre-CASTLE: {}%".format(
        round((total / 9) * 100, 5)))

    params = Parameters(args)
    stream = CASTLE(handler, headers, sensitive_attr, params)
    for (_, row) in processed.iterrows():
        stream.insert(row)

    # Average CASTLE's output groups, forcing these columns back to int64
    avg = mlu.average_group(sarray,
                            [("pid", np.int64), ("Overall", np.int64),
                             ("Club", np.int64), ("Nationality", np.int64),
                             ("Position", np.int64)])

    X = normalise(avg[headers])
    Y = avg[sensitive_attr]
    Y = Y.astype('int')

    print("Post-CASTLE Test Accuracy: {}%".format(round(NN(X, Y) * 100, 5)))

    print("Post-CASTLE KNN")
    total = 0
    for i in range(1, 10):
        valid = validation(X, Y, i)
        # BUG FIX: same misplaced round() parenthesis as above
        print("K={} Accuracy: {}%".format(i, round(valid * 100, 5)))
        total += valid
    print("Average Accuracy for Post-CASTLE: {}%".format(
        round((total / 9) * 100, 5)))