# Imports required by the table-building helpers below
# (read_matrix_from_file is a project helper defined elsewhere).
from collections import defaultdict

import latextable
from texttable import Texttable


def sg_varEx_table(ds, varEx):
    """
    Accepts results containing the variance explained by the selected components and the
    name of the dataset the stochastic greedy algorithms selected the features from.
    Tabulates the results and outputs them to a Tex file which can be imported into the
    research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        varEx (Dictionary): Key - algorithm type, Value - variance explained by the
            features selected by that algorithm.
    """
    header = ['K', 1]
    rows = [header]
    x = 2
    for k in varEx.keys():
        row = [k]
        header.append(x)
        x = x + 1
        for i in range(len(varEx[k])):
            row.append(varEx[k][i])
        rows.append(row)

    table = Texttable()
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(
        table,
        caption="Variance explained by variables selected by the stochastic greedy "
                "implementations for the {0} dataset, and for k = 1,..,6 the kth selected "
                "variable is indicated, using the default percentage for random "
                "sampling".format(ds))

    # Save the LaTeX output and a plain-text copy of the table
    with open('output/real/{0}/sgVarEx.tex'.format(ds), 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/real/{0}/sgVarEx.txt'.format(ds), 'w') as file:
        file.write(table.draw())
def realDataInfo():
    """
    Creates a table containing the dimensions of each of the real datasets used in the
    research study and outputs it to a Tex file.
    """
    header = ['Dataset', 'm', 'v']
    file_names = ['X50sites', 'Xpitprops', 'wdbc', 'frogs']
    data_names = ['Wave Sites', 'Pitprops', 'Breast Cancer Diagnosis', 'Anuran Frog Calls']

    rows = [header]
    for i in range(len(file_names)):
        mat = read_matrix_from_file('data/realData/{0}.txt'.format(file_names[i]))
        m, v = mat.shape  # m = number of samples, v = number of variables
        dataset = data_names[i]
        rows.append([dataset, m, v])

    table = Texttable()
    table.set_cols_align(["c"] * 3)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Overview of the real data used in this study")

    # Save the LaTeX output and a plain-text copy of the table
    with open('output/real/data_dimensions.tex', 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/real/data_dimensions.txt', 'w') as file:
        file.write(table.draw())
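# realDataInfo() (and the other table builders below) rely on a read_matrix_from_file
# helper defined elsewhere in the project. A minimal stand-in, assuming the data files
# are whitespace-separated numeric matrices (an assumption, not the project's own code):
import numpy as np

def read_matrix_from_file(path):
    # load a plain-text numeric matrix as a 2-D NumPy array
    return np.loadtxt(path, ndmin=2)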
def print_comparison_latex_code(self):
    print('\nTexttable Latex')
    print(
        latextable.draw_latex(
            self._table,
            caption="Comparison of accuracy on the test set before and after "
                    "running the pruning strategy"))
def sg_duration_table(ds, duration):
    """
    Tabulates the computational time taken by each algorithm to select features from the
    given dataset and outputs the table to a Tex file which can be imported into the
    research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        duration (Dictionary): Key - algorithm type, Value - computational time taken by
            that algorithm, given as a one-element list.
    """
    header = ['Algorithm', 'Duration']
    rows = [header]
    for k in duration.keys():
        rows.append([k, str(duration[k][0])])

    table = Texttable()
    table.set_cols_dtype(['t', 't'])
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(
        table,
        caption="Computational time in seconds to select 6 features from the {0} dataset "
                "with the greedy and lazy greedy unsupervised stochastic selection "
                "algorithms".format(ds))

    # Save the LaTeX output and a plain-text copy of the table
    with open('output/real/{0}/sgDurationTable.tex'.format(ds), 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/real/{0}/sgDurationTable.txt'.format(ds), 'w') as file:
        file.write(table.draw())
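# Hedged usage sketch for sg_duration_table: the algorithm labels and timings below are
# placeholders, and the output directories (output/real/<ds>/ and output/notLatex/real/<ds>/)
# are assumed to exist already.
example_duration = {
    "Stochastic Greedy": [1.23],        # one-element list; the function reads index 0
    "Lazy Stochastic Greedy": [0.87],
}
sg_duration_table("X50sites", example_duration)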
def sg_compare_varEx_table(ds, varEx, percentages):
    """
    Accepts results containing the variance explained by the selected components and the
    name of the dataset the stochastic greedy algorithms selected the features from.
    Tabulates the results and writes one table per algorithm to a Tex file which can be
    imported into the research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        varEx (Dictionary): Key - algorithm type, Value - Dictionary: Key - percentage,
            Value - the variance explained by the components selected by the algorithm
            with that percentage used in random sampling, given as a list.
        percentages (List of floats): Percentage values used in random sampling.
    """
    # Table header: Nc (number of selected components) is assumed to be a module-level
    # constant, matching the k = 1,..,6 captions used elsewhere.
    header = ['\%']
    for i in range(Nc):
        header.append(i + 1)

    # One table per algorithm, with one row per sampling percentage
    for k in varEx.keys():
        rows = [header]
        for p in percentages:
            row = [int(p * 100)]
            res = varEx[k][p]
            for x in res:
                row.append(x)
            rows.append(row)

        # Setup table
        table = Texttable()
        table.set_cols_align(["c"] * len(header))
        table.set_deco(Texttable.HEADER)
        table.add_rows(rows)

        # Create output
        outputLatex = latextable.draw_latex(
            table,
            caption="Variance explained by the variables selected by the {0} algorithm for "
                    "the {1} dataset, using different random sampling percentages, and for "
                    "k = 1,..,6 the kth selected variable is indicated".format(k, ds))

        # Write to file
        with open('output/real/{0}/sg_compareVarEx_{1}.tex'.format(ds, k), 'w') as file:
            file.write(outputLatex)
        with open('output/notLatex/real/{0}/sg_compareVarEx.txt'.format(ds), 'w') as file:
            file.write(table.draw())
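# Hedged usage sketch for sg_compare_varEx_table: Nc is assumed to be a module-level
# constant (6 here, to match the k = 1,..,6 captions), every number below is a
# placeholder, and the output directories are assumed to exist.
Nc = 6
example_percentages = [0.1, 0.2]
example_varEx = {
    "Stochastic Greedy": {
        0.1: [0.40, 0.58, 0.70, 0.78, 0.84, 0.89],
        0.2: [0.41, 0.60, 0.72, 0.80, 0.86, 0.90],
    },
}
sg_compare_varEx_table("X50sites", example_varEx, example_percentages)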
def randomDataInfo():
    """
    Creates a table containing the dimensions of each of the random datasets used in the
    research study and outputs it to a Tex file.
    """
    header = ['Dataset', 'm', 'v']
    rows = [header]
    for i in range(10):
        mat = read_matrix_from_file('data/randomData/t{0}.txt'.format(i + 1))
        m, v = mat.shape  # m = number of samples, v = number of variables
        dataset = 't{0}'.format(i + 1)
        rows.append([dataset, m, v])

    table = Texttable()
    table.set_cols_align(["c"] * 3)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Overview of the random data used in this study")

    # Save the LaTeX output and a plain-text copy of the table
    with open('output/random/data_dimensions.tex', 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/random/data_dimensions.txt', 'w') as file:
        file.write(table.draw())
def sg_sample_rows_table():
    """
    Tabulates the number of rows in each random sample when different percentages are
    used in random sampling. Outputs the table to a Tex file which can be imported into
    the research article.
    """
    header = ['']
    # rowsDict <key, value> = <dataset name, number of samples>
    rowsDict = defaultdict(int)
    for k in datasets.keys():
        X = read_matrix_from_file(datasets[k])
        rowsDict[k] = X.shape[0]
        header.append(k)

    # results for each percentage appended to rows
    rows = [header]
    for i in range(10, 110, 10):
        # results from each percentage become a row
        row = [i]
        for k in rowsDict.keys():
            row.append(int(rowsDict[k] * i * 0.01))
        rows.append(row)

    # Output tabulated results to a Tex file
    table = Texttable()
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(
        table,
        caption="Number of rows when different percentages are used to select subsets of "
                "the datasets via random sampling with replacement")

    # Save the LaTeX output and a plain-text copy of the table
    with open('output/real/sg_Sizes.tex', 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/real/sg_Sizes.txt', 'w') as file:
        file.write(table.draw())
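# sg_sample_rows_table() assumes a module-level `datasets` dict mapping dataset name to
# file path (an assumption inferred from how it is used above); for example, reusing the
# real-data file names from realDataInfo():
datasets = {
    "Wave Sites": "data/realData/X50sites.txt",
    "Pitprops": "data/realData/Xpitprops.txt",
    "Breast Cancer Diagnosis": "data/realData/wdbc.txt",
    "Anuran Frog Calls": "data/realData/frogs.txt",
}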
def sg_compare_duration_table(ds, duration, percentages):
    """
    Accepts results containing computation time data and the name of the dataset the
    stochastic greedy algorithms selected the features from. Tabulates the results and
    outputs them to a Tex file which can be imported into the research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        duration (Dictionary): Key - algorithm type, Value - Dictionary: Key - percentage,
            Value - the computational time taken by the algorithm with that percentage
            used in random sampling.
        percentages (List of floats): Percentage values used in random sampling.
    """
    # Table header: one column per sampling percentage
    header = ['Algorithms']
    for p in percentages:
        header.append(p)

    # Table rows: one row per algorithm
    rows = [header]
    for k in duration.keys():
        row = [k]
        for p in percentages:
            row.append(str(duration[k][p]))
        rows.append(row)

    # Construct table (text dtype for every column so the timings are not reformatted)
    table = Texttable()
    table.set_cols_dtype(['t'] * len(header))
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows, header=True)

    # Create output
    outputLatex = latextable.draw_latex(
        table,
        caption="Computation time (in seconds) to perform Stochastic Greedy feature "
                "selection on the {0} dataset with different percentages used to sample "
                "the data".format(ds))

    # Write to file
    with open('output/real/{0}/sg_compareDurationTable.tex'.format(ds), 'w') as file:
        file.write(outputLatex)
    with open('output/notLatex/real/{0}/sg_compareDurationTable.txt'.format(ds), 'w') as file:
        file.write(table.draw())
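# Hedged usage sketch for sg_compare_duration_table: timings are placeholder floats keyed
# by the same percentage values passed in `percentages`; output directories are assumed
# to exist.
example_percentages = [0.1, 0.2, 0.3]
example_duration = {
    "Stochastic Greedy": {0.1: 0.52, 0.2: 0.64, 0.3: 0.71},
    "Lazy Stochastic Greedy": {0.1: 0.41, 0.2: 0.49, 0.3: 0.55},
}
sg_compare_duration_table("X50sites", example_duration, example_percentages)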
def build_main_table(dataset: str, experiment: dict) -> str:
    t = Texttable()
    t.set_deco(t.HEADER)
    cats = ["LOC", "PER", "ORG"]
    miscdata = dataset != "WikiANN"  # WikiANN has no MISC category, so skip those columns

    row = ["Model name", "Trained on", "F1"]
    if miscdata:
        row.append(r"F1 {\tiny\textdiscount MISC}")
        cats.append("MISC")
    row += ["Prec.", "Rec."]
    row += cats
    t.set_cols_align(["l", "l"] + ["c"] * (3 + len(cats) + int(miscdata)))
    t.set_cols_dtype(["t"] * len(row))  # text dtype so Texttable keeps the formatting
    t.header(row)

    for m, mname in MODEL_NAMES.items():
        v = experiment[m]
        row = [mname, MODEL_TRAINDATA[m]]
        if miscdata:
            row.append(
                f1f(v["stats"]["micro avg"]["f1-score"])
                if v["stats"]["MISC"]["f1-score"] else "-")
        row.append(f1f(v["stats_nomisc"]["micro avg"]["f1-score"]))
        row.append(f1f(v["stats"]["micro avg"]["precision"]))
        row.append(f1f(v["stats"]["micro avg"]["recall"]))
        row += [f1f(v["stats"][c]["f1-score"] or "-") for c in cats]
        t.add_row(row)

    # print(t.draw())
    out = draw_latex(
        t,
        caption=rf"F1\pro-scores of Danish NER models on the {dataset} data-set "
                rf"consisting of {v['N']} sentences.",
        label=f"tab:{dataset}")
    print(out)
    return out
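# build_main_table assumes MODEL_NAMES / MODEL_TRAINDATA dicts and an f1f score formatter
# defined elsewhere in the project, plus `from latextable import draw_latex`. Minimal,
# purely illustrative stand-ins (the model and training-data labels are placeholders):
MODEL_NAMES = {"model_a": "Example model A"}
MODEL_TRAINDATA = {"model_a": "Example training data"}

def f1f(score):
    # format an F1-style score to two decimals; pass "-" placeholders through unchanged
    return score if isinstance(score, str) else f"{100 * score:.2f}"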
# Clean the data: drop rows with missing values and map class labels to integers
dataframe = dataframe.replace("?", np.nan).dropna()
classes = set(dataframe[dataframe.columns[0]])
classToInt = dict(zip(classes, range(len(classes))))
dataframe['class'] = dataframe['class'].apply(lambda x: classToInt[x])
dataset = dataframe.to_numpy()
TARGET = dataset[:, 0].astype(int)
DATA = dataset[:, 1:].astype(float)

# Run cross-validated evaluation and record the results for this dataset
accuracy, algorithm_used, algorithm_best = accuracy_test_combine_algo_cv(
    DATA, TARGET, dataset_name, n_splits=5, stratified=True, balanced=True)
table_accuracy.add_row(accuracy)
print(accuracy)
table_algo_best.add_row(algorithm_best)
print(algorithm_best)
table_algo_used.add_row(algorithm_used)
print(algorithm_used)
benchmark_result.append(accuracy[1])

# Append summary rows (average and median accuracy)
mean_accuracy = ["Average", np.average(benchmark_result)]
median_accuracy = ["Median", np.median(benchmark_result)]
table_accuracy.add_row(mean_accuracy)
table_accuracy.add_row(median_accuracy)

# Store the combined results alongside the existing benchmark results
benchmark_results = np.loadtxt("benchmark_results.csv", delimiter=", ")
benchmark_results = np.insert(benchmark_results, 0, benchmark_result, axis=1)
np.savetxt("benchmark_results_COMB.csv", benchmark_results, delimiter=", ")

# Print both the plain-text and LaTeX versions of each table
print(table_accuracy.draw() + "\n")
print(draw_latex(table_accuracy) + "\n")
print(table_algo_used.draw() + "\n")
print(draw_latex(table_algo_used) + "\n")
print(table_algo_best.draw() + "\n")
print(draw_latex(table_algo_best) + "\n")
# Dimensions and class-imbalance ratio (IR) of the first dataset
TARGET = dataset[:, 0].astype(int)
DATA = dataset[:, 1:].astype(float)
row.append(len(DATA[0]))  # number of features
row.append(len(DATA))     # number of samples
class0 = [x for x in TARGET if x == 0]
class1 = [x for x in TARGET if x == 1]
IR = (max(len(class0), len(class1)) / min(len(class0), len(class1)))
row.append(IR)

# Load the second dataset (offset by n) and compute the same statistics
file2 = dataset_location + datasets[i + n]
dataset_name2 = file2[file2.rfind("/") + 1:][:-4]
row.append(dataset_name2)
dataframe2 = pd.read_csv(file2, skiprows=0, sep='|')
dataframe2 = dataframe2.replace("?", np.nan).dropna()
classes2 = set(dataframe2[dataframe2.columns[0]])
classToInt2 = dict(zip(classes2, range(len(classes2))))
dataframe2['class'] = dataframe2['class'].apply(lambda x: classToInt2[x])
dataset2 = dataframe2.to_numpy()
TARGET2 = dataset2[:, 0].astype(int)
DATA2 = dataset2[:, 1:].astype(float)
row.append(len(DATA2[0]))
row.append(len(DATA2))
class0 = [x for x in TARGET2 if x == 0]
class1 = [x for x in TARGET2 if x == 1]
IR = (max(len(class0), len(class1)) / min(len(class0), len(class1)))
row.append(IR)
table.add_row(row)

# Print both the plain-text and LaTeX versions of the table
print(table.draw() + "\n")
print(draw_latex(table) + "\n")
def area_over_curve_lp():
    # Relies on helpers defined elsewhere in the project: load_adult, load_health,
    # get_optimal_front, get_pareto_front, get_dataframe_from_results, os_utils,
    # FIGURES_FOLDER, and fairlearn's demographic_parity_difference.
    def compute_ideal_area(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))
        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # compute desired rate, i.e. p(y=1|C=c)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate

        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)

        solution = get_optimal_front(Y, C)
        # add no error and max_dp to the solution
        solution.append([1, max_dp])
        solution = numpy.array(solution)
        # sort by dp
        solution = solution[solution[:, 1].argsort()]

        area = numpy.sum(
            # (acc - majority baseline) * (dp_next - dp_cur)
            (solution[:-1, 0] - majority_acc) * (solution[1:, 1] - solution[0:-1, 1]))
        return area, majority_acc, max_dp

    # Methods
    methods = [
        "fcrl", "cvib_supervised", "lag-fairness", "maxent_arl", "laftr",
        "adv_forgetting"
    ]

    # compute AUC table
    area = {}
    for data in ["adult", "health"]:
        # compute ideal areas
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        norm_area, majority_acc, max_dp = compute_ideal_area(Y, C)
        area[data] = {}
        for idx, key in enumerate([
                "nn_1_layer", "nn_2_layer", "random_forest", "svm",
                "logistic_regression"
        ]):
            area[data][key] = {}
            for m in methods:
                if data == "health" and m == "laftr":
                    continue
                t = numpy.load(f"result/eval/{data}/{m}.npy", allow_pickle=True).item()
                df = get_dataframe_from_results(t)

                # get pareto front
                pareto = df[[f'{key}_normalized_acc', f'{key}_normalized_dp']].values
                # drop nan
                pareto = pareto[~numpy.isnan(pareto).any(axis=1)]
                pareto = get_pareto_front(pareto)
                pareto = numpy.array(pareto)
                pareto = pareto[pareto[:, 1].argsort()]

                # reject points that have more dp than the data
                THRESH = 1.0
                idx = pareto.shape[0]
                while idx > -1:
                    if pareto[idx - 1, 1] > THRESH * max_dp:
                        idx = idx - 1
                    else:
                        break
                pareto = pareto[:idx]
                if idx == -1:
                    area[data][key][m] = 0
                    print(f"No point found below dp_max for {m}, {data}")
                    continue

                # add (majority_acc, 0) as a reference point to create horizontal bars,
                # and (pareto[-1, 0], max_dp), i.e. the max acc you can get at the data's dp
                pareto = numpy.concatenate(
                    [[[majority_acc, 0]], pareto, [[pareto[-1, 0], max_dp]]], axis=0)

                # get area by summing rectangles: (acc - baseline) * (dp_next - dp_cur)
                area[data][key][m] = numpy.sum(
                    (pareto[:-1, 0] - pareto[0, 0]) * (pareto[1:, 1] - pareto[0:-1, 1]))
                # normalize
                area[data][key][m] /= norm_area

    # dump to table
    for idx, key in enumerate([
            "nn_1_layer", "nn_2_layer", "random_forest", "svm",
            "logistic_regression"
    ]):
        table = Texttable()
        table.set_cols_align(["l", "c", "c"])
        table.header(["Method", "UCI Adult", "Heritage Health"])
        for m in methods:
            if m == "fcrl":
                table.add_row(["FCRL (Ours)", area["adult"][key][m], area["health"][key][m]])
            if m == "lag-fairness":
                table.add_row(["MIFR", area["adult"][key][m], area["health"][key][m]])
            if m == "maxent_arl":
                table.add_row(["MaxEnt-ARL", area["adult"][key][m], area["health"][key][m]])
            if m == "cvib_supervised":
                table.add_row(["CVIB", area["adult"][key][m], area["health"][key][m]])
            if m == "laftr":
                table.add_row(["LAFTR", area["adult"][key][m], "N/A"])
            if m == "adv_forgetting":
                table.add_row(["Adversarial Forgetting", area["adult"][key][m],
                               area["health"][key][m]])

        os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "table"))
        with open(os.path.join(FIGURES_FOLDER, "table", f"{key}.better.tex"), 'w') as f:
            f.write(
                latextable.draw_latex(
                    table,
                    caption="Area Over Parity Accuracy Curve",
                    label=f"AOPAC_{key}"))
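# ---------------------------------------------------------------------------------------
# Minimal, self-contained sketch of the Texttable + latextable pattern shared by the
# snippets above (illustrative only; the caption, label, rows, and output path are
# assumptions, not values from any of the projects).
# ---------------------------------------------------------------------------------------
from texttable import Texttable
import latextable

example_table = Texttable()
example_table.set_cols_align(["l", "c"])
example_table.set_deco(Texttable.HEADER)
example_table.add_rows([
    ["Method", "Score"],   # header row
    ["Baseline", 0.72],
    ["Proposed", 0.81],
])

# draw_latex returns the LaTeX table environment as a string
latex_code = latextable.draw_latex(example_table,
                                   caption="Illustrative example table",
                                   label="tab:example")

with open("example_table.tex", "w") as file:
    file.write(latex_code)
print(example_table.draw())  # plain-text rendering of the same table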