def test_apply_filters(s, t, c, values, min_value, max_value):
    # `values` and `range_filter` renamed from `list` and `filter` to avoid
    # shadowing builtins; behavior is unchanged.
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    for v in values:
        dataset1.tables[t].columns[c].add_value(v)
    range_filter = dict()
    range_filter["table"] = t
    range_filter["column"] = c
    range_filter["from"] = min_value
    range_filter["to"] = max_value
    filtered_dataset = dataset1.apply_filters([range_filter, range_filter])
    schema = filtered_dataset.get_schema()
    table_and_column_in_schema = t in schema and schema[t] and c in schema[t]
    column_has_values_inside_range = (
        schema[t][c]["count"] > 0
        and schema[t][c]["min_value"] >= min_value
        and schema[t][c]["max_value"] <= max_value)
    column_is_empty = schema[t][c]["count"] == 0
    wrong_filter_range = min_value > max_value
    assert table_and_column_in_schema and (column_has_values_inside_range
                                           or column_is_empty
                                           or wrong_filter_range)
def test_apply_filters(s, t, c, values, min_value, max_value):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    for v in values:
        dataset1.tables[t].columns[c].add_value(v, v)
    range_filter = dict()
    range_filter["table"] = t
    range_filter["column"] = c
    range_filter["from"] = min_value
    range_filter["to"] = max_value
    filtered_dataset = dataset1.apply_filters([range_filter, range_filter])
    schema = filtered_dataset.get_schema()
    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]
    filtered_items_count = schema[t][c]["count"]
    if filtered_items_count > 0 and min_value <= max_value:
        assert schema[t][c]["min_value"] >= min_value
        assert schema[t][c]["max_value"] <= max_value
    elif filtered_items_count == 0:
        assert schema[t][c]["count"] == 0
    else:
        # Reached only when the range is inverted (min_value > max_value):
        # filtering is then expected to be a no-op.
        assert schema[t][c]["count"] == len(values)
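# The two tests above exercise DataSet.apply_filters. A minimal usage sketch
# with hypothetical values (assumes the same DataSet class used throughout
# this listing, a numeric column, and an inclusive from/to range, which is
# what the assertions above imply):
dataset = DataSet("demo")
dataset.add_table("events", ["energy"])
for v in [0.5, 1.5, 2.5, 3.5]:
    dataset.tables["events"].columns["energy"].add_value(v)
# A filter is a plain dict selecting a table/column plus a value range.
energy_filter = {"table": "events", "column": "energy", "from": 1.0, "to": 3.0}
filtered = dataset.apply_filters([energy_filter])
print(filtered.get_schema()["events"]["energy"]["count"])  # expected: 2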
def detect_classifier(ce_matrix):
    cbn = load_xml_to_cbn(os.path.join(src_path, '../data/adult/adult.xml'))

    A1 = cbn.v['age']
    A2 = cbn.v['education']
    S = cbn.v['sex']
    M1 = cbn.v['workclass']
    M2 = cbn.v['marital-status']
    N = cbn.v['hours']
    Y = cbn.v['income']

    for i in [0, 1, 2, 3]:  # two datasets generated by two methods
        test = DataSet(pd.read_csv('temp/adult_binary_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # modify cpt of label before detect
            for a1, a2, n, m1, m2, s, y in product(A1.domains.get_all(), A2.domains.get_all(),
                                                   N.domains.get_all(), M1.domains.get_all(),
                                                   M2.domains.get_all(), S.domains.get_all(),
                                                   Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}),
                    Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})))
            cbn.build_joint_table()

            for k, (a1prime, a2prime, m1prime, m2prime) in enumerate(
                    product([0, 1], [0, 1], [0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn, s=spos, sprime=sneg, y=1,
                                               a1prime=a1prime, a2prime=a2prime,
                                               m1prime=m1prime, m2prime=m2prime)
                p = detect_after_remove(cbn=cbn, s=sneg, sprime=sneg, y=1,
                                        a1prime=a1prime, a2prime=a2prime,
                                        m1prime=m1prime, m2prime=m2prime)
                ce_matrix.iloc[j * 32 + k, 2 * i:2 * i + 2] = [p_u - p, p_l - p]

            for k, (a1prime, a2prime, m1prime, m2prime) in enumerate(
                    product([0, 1], [0, 1], [0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn, s=sneg, sprime=spos, y=1,
                                               a1prime=a1prime, a2prime=a2prime,
                                               m1prime=m1prime, m2prime=m2prime)
                p = detect_after_remove(cbn=cbn, s=spos, sprime=spos, y=1,
                                        a1prime=a1prime, a2prime=a2prime,
                                        m1prime=m1prime, m2prime=m2prime)
                ce_matrix.iloc[j * 32 + k + 16, 2 * i:2 * i + 2] = [p_u - p, p_l - p]
def get_fits_dataset(destination, dsId, table_ids):
    hdulist = fits.open(destination)
    dataset = DataSet(dsId)
    for t in range(len(hdulist)):
        if isinstance(hdulist[t], fits.hdu.table.BinTableHDU):
            table_id = table_ids[t]
            header_names = hdulist[t].columns.names
            tbdata = hdulist[t].data
            dataset.add_table(table_id, header_names)
            for i in range(len(header_names)):
                header_name = header_names[i]
                dataset.tables[table_id].columns[header_name].values = \
                    np.append([], tbdata.field(i))
        else:
            logging.debug("No valid data on: %s" % t)
            logging.debug("Type of Data: %s" % type(hdulist[t]))
    hdulist.close()
    logging.debug("Read fits file successfully: %s" % destination)
    return dataset
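# Usage sketch for the multi-HDU loader above (hypothetical file and table
# ids; assumes one table id per HDU, with non-table HDUs such as the primary
# HDU simply skipped, and that `dataset.tables` behaves like a dict as the
# loaders in this listing suggest):
dataset = get_fits_dataset("observation.fits", "OBS", ["PRIMARY", "EVENTS", "GTI"])
for table_id, table in dataset.tables.items():
    print(table_id, list(table.columns.keys()))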
def test_get_schema(s, t, c, v):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    dataset.tables[t].columns[c].add_value(v)
    schema = dataset.get_schema()
    assert (t in schema and schema[t] and c in schema[t]
            and schema[t][c]["id"] == c
            and "count" in schema[t][c]
            and schema[t][c]["count"] == 1)
def test_clone(s, t, c, v, e):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v, e)
    schema1 = dataset1.get_schema()
    dataset2 = dataset1.clone()
    schema2 = dataset2.get_schema()
    assert schema1 == schema2
def test_get_schema(s, t, c, v, e):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    dataset.tables[t].columns[c].add_value(v, e)
    schema = dataset.get_schema()
    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert schema[t][c]["id"] == c
    assert "count" in schema[t][c]
    assert schema[t][c]["count"] == 1
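# From the assertions in the schema tests above, get_schema() appears to
# return a nested dict shaped roughly like the sketch below. This is inferred
# from the tests, not read from the implementation:
#
# {
#     "<table_id>": {
#         "<column_id>": {"id": "<column_id>", "count": <n>,
#                         "min_value": <min>, "max_value": <max>},
#     },
# }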
def get_txt_dataset(destination, table_id, header_names):
    data = np.loadtxt(destination)
    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)
    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = data[0:len(data), i]
    logging.debug("Read txt file successfully: %s" % destination)
    return dataset
def get_fits_dataset(destination, table_id):
    hdulist = fits.open(destination)
    tbdata = hdulist[1].data
    header_names = hdulist[1].columns.names
    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)
    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = tbdata.field(i)
    logging.debug("Read lc file successfully: %s" % destination)
    return dataset
def getmenu():
    if not os.path.isfile(paths.DATASETS_JSON):
        dataset_info = []
        # Read in CSV
        datasets_csv = pd.read_csv(paths.DATASETS)
        # Get title, filename, id, and label for each data set and add it to the collection
        for i, row in datasets_csv.iterrows():
            dataset_title = row["Title"]
            dataset_filename = row["FileName"]
            dataset_id = row["ID"]
            dataset_label = row["Label"]
            dataset = DataSet(dataset_filename, dataset_title, dataset_id, dataset_label)
            dataset_info.append(dataset)
        # Save the collection as JSON
        datasets = DataSets(dataset_info=dataset_info)
        datasets_json = jsonpickle.encode(datasets)
        # Save the serialized JSON to a file
        with open(paths.DATASETS_JSON, 'w') as file:
            file.write(datasets_json)
        # Return the DataSets object so both branches return the same type
        # (the original returned the encoded string here but the decoded
        # object below).
        return datasets
    else:
        with open(paths.DATASETS_JSON, 'r') as serialized_file:
            json_str = serialized_file.read()
        return jsonpickle.decode(json_str)
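# getmenu relies on jsonpickle to round-trip arbitrary Python objects through
# JSON for its file cache. A self-contained sketch of that round trip, using a
# hypothetical Menu class that is not part of the project:
import jsonpickle

class Menu:
    def __init__(self, items):
        self.items = items

encoded = jsonpickle.encode(Menu(["a", "b"]))  # plain JSON string
restored = jsonpickle.decode(encoded)          # a Menu instance again
assert restored.items == ["a", "b"]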
def test_join(s, t, c, v0, e0, v1, e1):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v0, e0)
    dataset2 = DataSet(s)
    dataset2.add_table(t, [c])
    dataset2.tables[t].columns[c].add_value(v1, e1)  # was e0, leaving e1 unused
    dataset1 = dataset1.join(dataset2)
    schema = dataset1.get_schema()
    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]
    assert schema[t][c]["count"] == 2
def get_txt_dataset(destination, table_id, header_names):
    data = np.loadtxt(destination)
    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)
    # Column1, Column1Err, Column2, Column2Err ... header order expected
    for i in range(len(header_names)):
        header_name = header_names[i]
        column = dataset.tables[table_id].columns[header_name]
        column.values = data[0:len(data), i * 2]
        column.error_values = data[0:len(data), (i * 2) + 1]
    logging.debug("Read txt file successfully: %s" % destination)
    return dataset
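# The loader above expects value and error columns interleaved on disk, one
# pair per header name. A sketch of a matching file and call (file name and
# headers are hypothetical):
#
#   rates.txt contains:  TIME TIME_ERR RATE RATE_ERR
#                        0.0  0.1      10.0 1.0
#                        1.0  0.1      12.0 1.1
#
#   dataset = get_txt_dataset("rates.txt", "LC", ["TIME", "RATE"])
#   dataset.tables["LC"].columns["TIME"].values        # -> file column 0
#   dataset.tables["LC"].columns["TIME"].error_values  # -> file column 1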
def get_fits_dataset_with_stingray(destination, dsId='FITS',
                                   hduname='EVENTS', column='TIME'):
    # Gets columns from fits hdu table
    columns = get_fits_table_column_names(destination, hduname)

    # Prepares additional_columns
    additional_columns = []
    for i in range(len(columns)):
        if columns[i] != column:
            additional_columns = np.append(additional_columns, columns[i])

    # Reads fits data
    fits_data = load_events_and_gtis(destination,
                                     additional_columns=additional_columns)

    # Creates the dataset
    dataset = DataSet(dsId)

    # Fills Hdu table
    dataset.add_table(hduname, columns)
    dataset.tables[hduname].columns[column].add_values(fits_data.ev_list)
    for i in range(len(additional_columns)):
        column = additional_columns[i]
        dataset.tables[hduname].columns[column].add_values(
            fits_data.additional_data[column])

    # Fills Gtis table
    gti_columns = ["START", "STOP"]
    gti_start = fits_data.gti_list[:, 0]
    gti_end = fits_data.gti_list[:, 1]
    dataset.add_table("GTI", gti_columns)
    dataset.tables["GTI"].columns[gti_columns[0]].add_values(gti_start)
    dataset.tables["GTI"].columns[gti_columns[1]].add_values(gti_end)

    logging.debug("Read fits with stingray file successfully: %s" % destination)
    return dataset
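# Usage sketch for the stingray-backed loader (hypothetical event file;
# assumes stingray's load_events_and_gtis and the get_fits_table_column_names
# helper are importable as above, and that columns expose .values the way the
# other loaders in this listing do):
dataset = get_fits_dataset_with_stingray("events.fits", dsId="XMM",
                                         hduname="EVENTS", column="TIME")
print(dataset.tables["EVENTS"].columns["TIME"].values[:5])
print(dataset.tables["GTI"].columns["START"].values)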
def test_add_table(s, t, c):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    assert len(dataset.tables) == 1
def detect_classifier(ce_matrix):
    cbn = load_xml_to_cbn(
        os.path.join(src_path, '../data/synthetic/ProbabilisticBayesianModel.xml'))

    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    for i in [0, 1, 2, 3]:  # two datasets generated by two methods
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # modify cpt of label before detect
            for a, n, m, s, y in product(A.domains.get_all(), N.domains.get_all(),
                                         M.domains.get_all(), S.domains.get_all(),
                                         Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}),
                    Event({A: a, M: m, N: n, S: s}),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({'A': a, 'M': m, 'N': n, 'S': s})))
            cbn.build_joint_table()

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn, s=spos, sprime=sneg, y=1,
                                               aprime=aprime, mprime=mprime)
                p = detect_after_remove(cbn=cbn, s=sneg, sprime=sneg, y=1,
                                        aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k, 3 * i:3 * i + 2] = [p_u - p, p_l - p]

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn, s=sneg, sprime=spos, y=1,
                                               aprime=aprime, mprime=mprime)
                p = detect_after_remove(cbn=cbn, s=spos, sprime=spos, y=1,
                                        aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k + 4, 3 * i:3 * i + 2] = [p_u - p, p_l - p]
def test_init(s):
    dataset = DataSet(s)
    assert dataset
    assert dataset.id == s
def pearl_detect_classifier(ce_matrix):
    cbn = load_xml_to_cbn(
        os.path.join(src_path, '../data/synthetic/DeterministicBayesianModel.xml'))

    UA = cbn.v['UA']
    UN = cbn.v['UN']
    UM = cbn.v['UM']
    US = cbn.v['US']
    UY = cbn.v['UY']

    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    cbn.build_joint_table()
    event = cbn.jpt.groupby(
        Event({UA: 1, UN: 1, UM: 1, US: 1, A: 1, M: 1, S: 1}).keys())
    condition = cbn.jpt.groupby(Event({A: 1, M: 1, S: 1}).keys())

    def pearl_after_remove_(s, sprime, y, aprime, mprime):
        p = 0.0
        for ua, un, um, us in product(UA.domains.get_all(), UN.domains.get_all(),
                                      UM.domains.get_all(), US.domains.get_all()):
            e = Event({UA: ua, UN: un, UM: um, US: us,
                       A: aprime, M: mprime, S: sprime})
            c = Event({A: aprime, M: mprime, S: sprime})
            ps = event.get_group(tuple(e.values()))['prob'].sum() / \
                 condition.get_group(tuple(c.values()))['prob'].sum()

            for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                                   M.domains.get_all()):
                p += cbn.find_prob(Event({A: a}), Event({UA: ua})) * \
                     cbn.find_prob(Event({M: m}), Event({S: s, A: a, UM: um})) * \
                     cbn.find_prob(Event({N: n}), Event({S: s, A: a, UN: un})) * \
                     cbn.find_prob(Event({Y: y}), Event({S: s, A: a, N: n, M: m, UY: 1})) * \
                     ps
        return p

    for i in [0, 1, 2, 3]:  # two datasets generated by two methods
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # modify cpt of label before detect
            for a, n, m, s, y in product(A.domains.get_all(), N.domains.get_all(),
                                         M.domains.get_all(), S.domains.get_all(),
                                         Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}),
                    Event({A: a, M: m, N: n, S: s, UY: 1}),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({'A': a, 'M': m, 'N': n, 'S': s})))

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_(s=spos, sprime=sneg, y=1, aprime=aprime, mprime=mprime) - \
                     pearl_after_remove_(s=sneg, sprime=sneg, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k, 3 * i + 2] = ce

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_(s=sneg, sprime=spos, y=1, aprime=aprime, mprime=mprime) - \
                     pearl_after_remove_(s=spos, sprime=spos, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k + 4, 3 * i + 2] = ce
def method3(acc_matrix):
    df_train = pd.read_csv('temp/adult_binary_train_prediction0.csv')
    # df_train = pd.concat([df_train] * 10, ignore_index=True)
    train = DataSet(df_train)
    df_test = pd.read_csv('temp/adult_binary_test_prediction0.csv')
    df_test = pd.concat([df_test] * 3, ignore_index=True)
    test = DataSet(df_test)

    acc = []
    for name in ['LR', 'SVM']:
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path, '../data/adult/adult.xml'))

        def find_condition_prob(e, t):
            return probabilistic_cbn.find_prob(e, t)

        def get_loc(e):
            return probabilistic_cbn.get_loc(e)

        A1 = probabilistic_cbn.v['age']
        A2 = probabilistic_cbn.v['education']
        S = probabilistic_cbn.v['sex']
        M1 = probabilistic_cbn.v['workclass']
        M2 = probabilistic_cbn.v['marital-status']
        N = probabilistic_cbn.v['hours']
        Y = probabilistic_cbn.v['income']

        YH = Variable(name=name, index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH
        YT = Variable(name=name + "M", index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        # build linear loss function
        C_vector = np.zeros((2 ** 8 + 2 ** 8 // 4, 1))
        for a1, a2, n, m1, m2, s in product(A1.domains.get_all(), A2.domains.get_all(),
                                            N.domains.get_all(), M1.domains.get_all(),
                                            M2.domains.get_all(), S.domains.get_all()):
            p_x_s = train.get_marginal_prob(
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

            p_yh_1_y = p_x_s * train.count(
                Event({Y: 0, YH: 0}),
                Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), 'notequal')
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 0}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({YH: 0}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 1}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({YH: 1}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

            p_yh__y = p_x_s * train.count(
                Event({Y: 0, YH: 0}),
                Event({A1: a1, A2: a2, M1: m1, M2: m2, N: n, S: s}), 'equal')
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 1}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({YH: 0}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
            loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 0}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({YH: 1}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

        # the inequality of max and min
        G_matrix_1 = np.zeros((2 ** 8, 2 ** 8 + 2 ** 8 // 4))
        h_1 = np.zeros(2 ** 8)

        # max
        i = 0
        for a1, a2, n, s, yt in product(A1.domains.get_all(), A2.domains.get_all(),
                                        N.domains.get_all(), S.domains.get_all(),
                                        YT.domains.get_all()):
            for m1, m2 in product(M1.domains.get_all(), M2.domains.get_all()):
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = train.get_conditional_prob(
                        Event({YH: yh}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2 ** 8 + loc] = -1
                i += 1

        # min
        assert i == 2 ** 8 // 2
        for a1, a2, n, s, yt in product(A1.domains.get_all(), A2.domains.get_all(),
                                        N.domains.get_all(), S.domains.get_all(),
                                        YT.domains.get_all()):
            for m1, m2 in product(M1.domains.get_all(), M2.domains.get_all()):
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = -train.get_conditional_prob(
                        Event({YH: yh}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2 ** 8 + 2 ** 8 // 8 + loc] = 1
                i += 1

        # build counterfactual fairness constraints
        G_matrix_2 = np.zeros((2 ** 4 * 2, 2 ** 8 + 2 ** 8 // 4))
        h_2 = np.ones(2 ** 4 * 2) * tau
        i = 0
        for a1, a2, m1, m2 in product(A1.domains.get_all(), A2.domains.get_all(),
                                      M1.domains.get_all(), M2.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2 ** 8 + loc] = find_condition_prob(
                    Event({N: n}), Event({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
                    G_matrix_2[i, loc] = -find_condition_prob(Event({N: n}), Event({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
                        * train.get_conditional_prob(Event({YH: yh}),
                                                     Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
            i += 1

        assert i == 2 ** 4
        for a1, a2, m1, m2 in product(A1.domains.get_all(), A2.domains.get_all(),
                                      M1.domains.get_all(), M2.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2 ** 8 + 2 ** 8 // 8 + loc] = -find_condition_prob(
                    Event({N: n}), Event({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
                    G_matrix_2[i, loc] = find_condition_prob(Event({N: n}), Event({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
                        * train.get_conditional_prob(Event({YH: yh}),
                                                     Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
            i += 1

        ###########
        # mapping in [0, 1]
        G_matrix_3 = np.zeros((2 * (2 ** 8 + 2 ** 8 // 4), 2 ** 8 + 2 ** 8 // 4))
        h_3 = np.zeros(2 * (2 ** 8 + 2 ** 8 // 4))
        for i in range(2 ** 8 + 2 ** 8 // 4):
            G_matrix_3[i, i] = 1
            h_3[i] = 1
            G_matrix_3[2 ** 8 + 2 ** 8 // 4 + i, i] = -1
            h_3[2 ** 8 + 2 ** 8 // 4 + i] = 0

        # sum = 1
        A_matrix = np.zeros((2 ** 8 // 2, 2 ** 8 + 2 ** 8 // 4))
        b = np.ones(2 ** 8 // 2)
        i = 0
        for a1, a2, n, m1, m2, s, yh in product(A1.domains.get_all(), A2.domains.get_all(),
                                                N.domains.get_all(), M1.domains.get_all(),
                                                M2.domains.get_all(), S.domains.get_all(),
                                                YH.domains.get_all()):
            for yt in YT.domains.get_all():
                A_matrix[i, get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1
            i += 1
        assert i == 2 ** 8 // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # Test
        # print(np.linalg.matrix_rank(A_matrix), A_matrix.shape[0])
        # print(np.linalg.matrix_rank(np.vstack([A_matrix, G_matrix])), A_matrix.shape[1])

        # def check():
        #     sol = np.zeros(2 ** 8 + 2 ** 8 // 4)
        #     for a1, a2, n, m1, m2, s, yh, yt in product(A1.domains.get_all(), A2.domains.get_all(),
        #                                                 N.domains.get_all(), M1.domains.get_all(),
        #                                                 M2.domains.get_all(), S.domains.get_all(),
        #                                                 YH.domains.get_all(), YT.domains.get_all()):
        #         if yh.name == yt.name:
        #             sol[get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1.0
        #         else:
        #             sol[get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 0.0
        #
        #     for a1, a2, n, s, yt in product(A1.domains.get_all(), A2.domains.get_all(),
        #                                     N.domains.get_all(), S.domains.get_all(),
        #                                     YT.domains.get_all()):
        #         p_min = 1
        #         p_max = 0
        #         for m1, m2 in product(M1.domains.get_all(), M2.domains.get_all()):
        #             p = 0.0
        #             for yh in YH.domains.get_all():
        #                 p = train.get_conditional_prob(Event({YH: yh}), Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})) \
        #                     * sol[get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))]
        #             if p < p_min:
        #                 p_min = p
        #             if p > p_max:
        #                 p_max = p
        #         loc = get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))
        #         sol[2 ** 8 + loc] = p_max
        #         sol[2 ** 8 + 2 ** 8 // 8 + loc] = p_min
        #
        #     np.dot(G_matrix_2, sol)
        # check()

        # solver (default cvxopt LP solver; the original passed
        # `solver=solvers`, i.e. the module itself, which is not a valid option)
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector), G=matrix(G_matrix), h=matrix(h),
                         A=matrix(A_matrix), b=matrix(b))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df.loc[:, name + 'M'] = train.df[name]
        test.df[name + 'M'] = test.df[name]
        for a1, a2, n, m1, m2, s, yh, yt in product(A1.domains.get_all(), A2.domains.get_all(),
                                                    N.domains.get_all(), M1.domains.get_all(),
                                                    M2.domains.get_all(), S.domains.get_all(),
                                                    YH.domains.get_all(), YT.domains.get_all()):
            if yh.name != yt.name:
                p = mapping[get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt})), 0]
                train.random_assign(Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                                    Event({YT: yt}), p)
                test.random_assign(Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                                   Event({YT: yt}), p)

        train.df[name] = train.df[name + 'M']
        train.df = train.df.drop([name + 'M'], axis=1)  # drop returns a copy; assign it back
        test.df[name] = test.df[name + 'M']
        test.df = test.df.drop([name + 'M'], axis=1)

        acc.append(accuracy_score(train.df[name], train.df[Y.name]))
        acc.append(accuracy_score(test.df[name], test.df[Y.name]))

    acc_matrix.iloc[:, 3] = acc
    train.df.to_csv('temp/adult_binary_train_prediction3.csv', index=False)
    test.df.to_csv('temp/adult_binary_test_prediction3.csv', index=False)
from model.dataset import DataSet
import time

file_folder = '_'.join([str(i) for i in time.localtime(time.time())][:3])
dir_list = ['2020_7_9']
dataset = DataSet(dir_list)
attr_value = dataset.get_date_stock("2020_7_9", "光大证券", "现手")
print("len:", len(attr_value))

# Cluster all stocks of a given day by their buy/sell transaction ratio
def method3(acc_matrix):
    df_train = pd.read_csv('temp/synthetic_train_prediction0.csv')
    train = DataSet(df_train)
    df_test = pd.read_csv('temp/synthetic_test_prediction0.csv')
    test = DataSet(df_test)

    acc = []
    for name in ['LR', 'SVM']:
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path, '../data/synthetic/ProbabilisticBayesianModel.xml'))

        def find_condition_prob(e, t):
            return probabilistic_cbn.find_prob(e, t)

        def get_loc(e):
            return probabilistic_cbn.get_loc(e)

        A = probabilistic_cbn.v['A']
        S = probabilistic_cbn.v['S']
        N = probabilistic_cbn.v['N']
        M = probabilistic_cbn.v['M']
        Y = probabilistic_cbn.v['Y']

        YH = Variable(name='YH', index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH
        YT = Variable(name='YT', index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        # build linear loss function
        C_vector = np.zeros((2 ** 6 + 2 ** 6 // 2, 1))
        for a, n, m, s in product(A.domains.get_all(), N.domains.get_all(),
                                  M.domains.get_all(), S.domains.get_all()):
            p_x_s = train.get_marginal_prob(Event({'A': a, 'M': m, 'N': n, 'S': s}))

            p_yh_1_y = p_x_s * train.count(
                Event({'Y': 0, name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}), 'notequal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 0}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 0}), Event({'A': a, 'M': m, 'N': n, 'S': s}))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 1}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 1}), Event({'A': a, 'M': m, 'N': n, 'S': s}))

            p_yh__y = p_x_s * train.count(
                Event({'Y': 0, name: 0}),
                Event({'A': a, 'M': m, 'N': n, 'S': s}), 'equal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 1}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 0}), Event({'A': a, 'M': m, 'N': n, 'S': s}))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 0}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 1}), Event({'A': a, 'M': m, 'N': n, 'S': s}))

        # the inequality of max and min
        G_matrix_1 = np.zeros((2 ** 6, 2 ** 6 + 2 ** 6 // 2))
        h_1 = np.zeros(2 ** 6)

        # max
        i = 0
        for a, n, s, yt in product(A.domains.get_all(), N.domains.get_all(),
                                   S.domains.get_all(), YT.domains.get_all()):
            for m in M.domains.get_all():
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = train.get_conditional_prob(
                        Event({name: yh}), Event({'A': a, 'M': m, 'N': n, 'S': s}))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2 ** 6 + loc] = -1
                i += 1

        # min
        assert i == 2 ** 6 // 2
        for a, n, s, yt in product(A.domains.get_all(), N.domains.get_all(),
                                   S.domains.get_all(), YT.domains.get_all()):
            for m in M.domains.get_all():
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))
                    G_matrix_1[i, loc] = -train.get_conditional_prob(
                        Event({name: yh}), Event({'A': a, 'M': m, 'N': n, 'S': s}))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2 ** 6 + 2 ** 6 // 4 + loc] = 1
                i += 1

        # build counterfactual fairness constraints
        G_matrix_2 = np.zeros((2 ** 2 * 2, 2 ** 6 + 2 ** 6 // 2))
        h_2 = np.ones(2 ** 2 * 2) * tau
        i = 0
        for a, m in product(A.domains.get_all(), M.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2 ** 6 + loc] = find_condition_prob(
                    Event({N: n}), Event({A: a, S: spos}))
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A: a, M: m, N: n, S: sneg, YH: yh, YT: yt_pos}))
                    G_matrix_2[i, loc] = -find_condition_prob(Event({N: n}), Event({A: a, S: sneg})) \
                        * train.get_conditional_prob(Event({name: yh}),
                                                     Event({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        assert i == 2 ** 2
        for a, m in product(A.domains.get_all(), M.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2 ** 6 + 2 ** 6 // 4 + loc] = -find_condition_prob(
                    Event({N: n}), Event({A: a, S: spos}))
                for yh in YH.domains.get_all():
                    loc = get_loc(Event({A: a, M: m, N: n, S: sneg, YH: yh, YT: yt_pos}))
                    G_matrix_2[i, loc] = find_condition_prob(Event({N: n}), Event({A: a, S: sneg})) \
                        * train.get_conditional_prob(Event({name: yh}),
                                                     Event({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        ###########
        # mapping in [0, 1]
        G_matrix_3 = np.zeros(((2 ** 6 + 2 ** 6 // 2) * 2, 2 ** 6 + 2 ** 6 // 2))
        h_3 = np.zeros((2 ** 6 + 2 ** 6 // 2) * 2)
        for i in range(2 ** 6 + 2 ** 6 // 2):
            G_matrix_3[i, i] = 1
            h_3[i] = 1
            G_matrix_3[2 ** 6 + 2 ** 6 // 2 + i, i] = -1
            h_3[2 ** 6 + 2 ** 6 // 2 + i] = 0

        # sum = 1
        A_matrix = np.zeros((2 ** 6 // 2, 2 ** 6 + 2 ** 6 // 2))
        b = np.ones(2 ** 6 // 2)
        i = 0
        for a, n, m, s, yh in product(A.domains.get_all(), N.domains.get_all(),
                                      M.domains.get_all(), S.domains.get_all(),
                                      YH.domains.get_all()):
            for yt in YT.domains.get_all():
                A_matrix[i, get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))] = 1
            i += 1
        assert i == 2 ** 6 // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver (default cvxopt LP solver; `solver=solvers` in the original
        # passed the module itself and was not a valid option)
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector), G=matrix(G_matrix), h=matrix(h),
                         A=matrix(A_matrix), b=matrix(b))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df[name + '1'] = train.df[name]
        test.df[name + '1'] = test.df[name]
        for a, n, m, s, yh, yt in product(A.domains.get_all(), N.domains.get_all(),
                                          M.domains.get_all(), S.domains.get_all(),
                                          YH.domains.get_all(), YT.domains.get_all()):
            if yh.name != yt.name:
                p = mapping[get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt})), 0]
                train.random_assign(Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                                    Event({name + '1': yt}), p)
                test.random_assign(Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                                   Event({name + '1': yt}), p)

        train.df[name] = train.df[name + '1']
        train.df = train.df.drop([name + '1'], axis=1)  # drop returns a copy; assign it back
        test.df[name] = test.df[name + '1']
        test.df = test.df.drop([name + '1'], axis=1)

        acc.append(accuracy_score(train.df['Y'], train.df[name]))
        acc.append(accuracy_score(test.df['Y'], test.df[name]))

    acc_matrix.iloc[:, 3] = acc
    train.df.to_csv('temp/synthetic_train_prediction3.csv', index=False)
    test.df.to_csv('temp/synthetic_test_prediction3.csv', index=False)
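# Both method3 variants above reduce fair post-processing to a linear program
# solved with cvxopt. A self-contained sketch of the same solvers.lp call on a
# toy LP (minimize -4x - 5y subject to 2x + y <= 3, x + 2y <= 3, x >= 0,
# y >= 0), so the matrix/solvers API is clear in isolation:
from cvxopt import matrix, solvers

c = matrix([-4.0, -5.0])
G = matrix([[2.0, 1.0, -1.0, 0.0],   # cvxopt matrices are column-major:
            [1.0, 2.0, 0.0, -1.0]])  # each inner list is one column of G
h = matrix([3.0, 3.0, 0.0, 0.0])

solvers.options['show_progress'] = False
sol = solvers.lp(c, G, h)
print(sol['x'])  # optimal point, approximately (1, 1)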
def test_init(s):
    dataset = DataSet(s)
    assert dataset
    assert len(dataset.id) > len(s)
def analyse_vocab(rouge, datasets=None, topics=None):
    if datasets is None:
        return
    concept_types = ("parse", "ngrams")  # renamed from `concept_type` to avoid
    # shadowing the loop variable below
    embedding_variants = ("google.neg.300d", "glove.6B.300d", "tudarmstadt_german")
    topic_details = []
    concept_details = []
    token_details = []
    embeddings_path = path.normpath(path.join(args.iobasedir, "embeddings"))

    for dataset, concept_type, embedding_variant in itertools.product(
            datasets, concept_types, embedding_variants):
        print("running analysis for ", dataset, concept_type, embedding_variant,
              "--------------------------------------")
        i = 0
        ds = resolve_against_iobase(dataset, args.iobasedir)
        d = DataSet(ds)
        language = d.get_language()
        embeddings = load_w2v_by_name(embeddings_path, variant=embedding_variant)
        for topic in d.get_topics():
            sumewrap = SumeWrap(language=language)
            i += 1
            docs = topic.get_docs()
            summaries = topic.get_models()
            parse_info = topic.get_parse_info(0)
            sf = SimulatedFeedback(language, rouge, embeddings=embeddings,
                                   docs=docs, models=summaries,
                                   summary_length=100,
                                   oracle_type="active_learning",
                                   ub_score=(1, 1, 1), ub_summary=" ",
                                   parser_type=concept_type)
            # sf.run_full_simulation(max_iteration_count=0)
            doc_sentences = sf.summarizer.sentences
            summaries_parse_info = [list(topic.get_models(parsed=True)),
                                    list(topic.get_models(parsed=True))]
            if concept_type == "parse":  # `is` compared identity; strings need ==
                sumewrap.s.sentences = sumewrap.load_sume_sentences(
                    summaries, parse_type=concept_type,
                    parse_info=list(summaries_parse_info))
                sumewrap.s.extract_ngrams2(concept_type="phrase")
            else:
                sumewrap.s.sentences = sumewrap.load_sume_sentences(summaries)
                sumewrap.s.extract_ngrams2()
            sumewrap.s.compute_document_frequency()
            model_sentences = sumewrap.s.sentences

            def collect_token_details(sentences, from_summary, from_document):
                # The original repeated this block verbatim for document and
                # model sentences; factored into one helper, behavior unchanged.
                for s in sentences:
                    sentence_pos = s.position
                    doc_id = s.doc_id
                    for concept in s.concepts:
                        ngrams = concept.split(' ')
                        for token in ngrams:
                            pos = "UNK"
                            try:
                                word, pos = s.tokens_pos[token].split('::')
                            except Exception:
                                token = re.sub(r'[-\.](\s|$)', r'\1', token)
                                try:
                                    word, pos = s.tokens_pos[concept].split('::')
                                except Exception:
                                    word, pos = token, 'NN'
                            token_details.append({
                                "sentence_pos": sentence_pos,
                                "doc_id": doc_id,
                                "topic": topic.get_name(),
                                "dataset": d.get_name(),
                                "language": d.get_language(),
                                "token": token,
                                "word": word,
                                "pos_tag": pos,
                                "from_summary": from_summary,
                                "from_document": from_document,
                                "concept_type": concept_type,
                                "embedding_variant": embedding_variant,
                                "token_has_embedding": embeddings.isKnown(token),
                                "word_has_embedding": embeddings.isKnown(word)})

            # token_details: document sentences, then model (summary) sentences
            collect_token_details(doc_sentences, from_summary=False, from_document=True)
            collect_token_details(model_sentences, from_summary=True, from_document=False)

    # post-process token details
    token_df = pd.DataFrame(token_details)
    # token_df.groupby("dataset")
    # print(token_df.head())
    filename = "C:\\Users\\hatieke\\.ukpsummarizer\\tmp\\tokens_new.csv"
    print("saving token_df to ", filename)
    token_df.to_csv(filename, encoding="UTF-8")
        if args.pickleout is None:
            pickleout = None
        else:
            pickleout = resolve_filename(args.pickleout.replace("\"", ""), base=iobasedir)
        runner.single_iteration(picklein=picklein, pickleout=pickleout, feedbacks=js)
    elif args.command == 'summarize':
        # check if the path refers to a dataset, a topic or a sole model:
        queue = []
        f = utils.reader.resolve_against_iobase(args.file, iobasedir)
        if path.exists(path.join(f, "index.json")):
            # is_dataset
            d = DataSet(f)
            # unroll to get topics
            for t in d.get_topics():
                for (mf, mt) in t.get_models():
                    mf = path.normpath(mf)
                    pref = path.commonprefix([mf, iobasedir])
                    tn = mf[len(pref) + 1:]
                    print("shortened:", tn)
                    queue.append(mf)
            # topics.append([t.get_name for t in d.get_topics()])
        elif path.exists(path.join(f, "task.json")):
            # is topic
            t = Topic(f)
            for (mf, mt) in t.get_models():