Example #1
    def generate_dataset_in_random_mode(self,
                                        n,
                                        description_file,
                                        seed=0,
                                        minimum=0,
                                        maximum=100):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr in description['attribute_description'].keys():
            attr_info = description['attribute_description'][attr]
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            if is_candidate_key:
                self.synthetic_dataset[attr] = parse_json(
                    attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                # 'random' here is numpy.random: draw n values uniformly from the bins
                self.synthetic_dataset[attr] = random.choice(
                    attr_info['distribution_bins'], n)
            elif datatype == 'String':
                # draw one random length per row; assigning a single scalar length
                # would leave the column empty when this attribute comes first
                lengths = random.randint(attr_info['min'],
                                         attr_info['max'] + 1, n)
                self.synthetic_dataset[attr] = lengths
                self.synthetic_dataset[attr] = self.synthetic_dataset[
                    attr].map(generate_random_string)
            elif datatype == 'Integer':
                self.synthetic_dataset[attr] = random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = random.uniform(
                    minimum, maximum, n)
Example #2
def get_plot_data(input_dataset_file, synthetic_dataset_file,
                  description_file):
    description = read_json_file(description_file)
    df_before = pd.read_csv(input_dataset_file)
    df_after = pd.read_csv(synthetic_dataset_file)
    plot_data = {'histogram': {}, 'barchart': {}, 'heatmap': {}}
    for attr in df_before:  # iterating a DataFrame yields its column names
        if description['attribute_description'][attr]['is_categorical']:
            bins_before, counts_before = get_barchart_data(df_before, attr)
            bins_after, counts_after = get_barchart_data(df_after, attr)
            plot_data['barchart'][attr] = {
                'before': {
                    'bins': bins_before,
                    'counts': counts_before
                },
                'after': {
                    'bins': bins_after,
                    'counts': counts_after
                }
            }
        elif description['attribute_description'][attr]['data_type'] in {
                'Integer', 'Float'
        }:
            plot_data['histogram'][attr] = {
                'before': get_histogram_data(df_before, attr),
                'after': get_histogram_data(df_after, attr)
            }

    plot_data['heatmap']['before'] = get_heatmap_data(input_dataset_file)
    plot_data['heatmap']['after'] = get_heatmap_data(synthetic_dataset_file)
    plot_file_name = input_dataset_file.replace(".csv", "_plot.json")
    with open(plot_file_name, 'w') as outfile:
        json.dump(plot_data, outfile, indent=4)
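
The function's side effect is a '<input>_plot.json' file; a hedged usage sketch feeding it to the helpers in Examples #13 and #15 (file names are hypothetical):

get_plot_data('adult.csv', 'adult_synthetic_data.csv', 'adult_description.json')
drawable_attributes = get_drawable_attributes('adult_plot.json')  # file written by the call above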
Example #3
    def generate_dataset_in_random_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = pd.DataFrame()
        for attr in description['attribute_description'].keys():
            attr_description = description['attribute_description'][attr]
            datatype = attr_description['datatype']
            is_categorical = attr_description['is_categorical']
            if is_categorical:
                self.synthetic_dataset[attr] = np.random.choice(
                    attr_description['distribution_bins'], n)
            elif datatype == 'string':
                # one random length per row; max + 1 keeps the upper bound
                # inclusive (matching the integer branch below), and a scalar
                # length would leave the column empty when this attribute comes first
                lengths = np.random.randint(attr_description['min'],
                                            attr_description['max'] + 1, n)
                self.synthetic_dataset[attr] = lengths
                self.synthetic_dataset[attr] = self.synthetic_dataset[
                    attr].map(generate_random_string)
            else:
                minimum, maximum = attr_description['min'], attr_description[
                    'max']
                if datatype == 'int':
                    self.synthetic_dataset[attr] = np.random.randint(
                        minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = np.random.uniform(
                        minimum, maximum, n)
Example #4
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        set_random_seed(seed)
        self.n = n
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in self.encoded_dataset:
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(
                    self.encoded_dataset[attr])
            elif attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                # attributes outside the BN and candidate keys fall back to
                # independent attribute mode
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(
                    binning_indices)
Example #5
def generateRanking(current_file, top_K=100):
    """
    Generate a ranking of input data.

    Attributes:
        current_file: file name that stored the data (with out ".csv" suffix)
        top_K: threshold of returned generated ranking
    Return:  json data of a dataframe that stored the generated ranking
    """
    ranks_file = current_file + "_rankings.json"
    rankings_paras = read_json_file(ranks_file)
    data = pd.read_csv(current_file + ".csv")
    # before computing the score, replace NA values in the data with 0
    filled_data = data.fillna(value=0)
    chosed_atts = rankings_paras["ranked_atts"]
    filled_data["GeneratedScore"] = 0
    for att, weight in zip(chosed_atts, rankings_paras["ranked_atts_weight"]):
        filled_data["GeneratedScore"] += weight * filled_data[att]
    # move GeneratedScore to the first column; reindex_axis was removed in pandas 1.0
    filled_data = filled_data.reindex(
        columns=['GeneratedScore'] +
        [a for a in filled_data.columns if a != 'GeneratedScore'])
    # save the weighted-sum data to a CSV on the server
    filled_data.sort_values(by="GeneratedScore", ascending=False, inplace=True)
    filled_data.to_csv(current_file + "_weightsum.csv", index=False)
    # only show top_K rows in the UI
    display_data = filled_data.head(top_K)
    return display_data.to_json(orient='records')
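
A hedged sketch of the minimal '<current_file>_rankings.json' that generateRanking consumes (attribute names and weights are hypothetical):

import json

rankings_paras = {
    "ranked_atts": ["gpa", "sat"],       # attributes to combine
    "ranked_atts_weight": [0.7, 0.3],    # one weight per attribute
}
with open("students_rankings.json", "w") as f:
    json.dump(rankings_paras, f)

ranking_json = generateRanking("students", top_K=50)  # also expects students.csv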
Example #6
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        self.n = n
        set_random_seed(seed)
        self.description = read_json_file(description_file)
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)
        self.sample_from_encoded_dataset()
Example #7
def computePvaluePairwise(att_name, att_value, current_file, round_default=2):
    """
    Compute p-value using Pairwise oracle

    Attributes:
        att_name: sensitive attribute name
        att_value: value of protected group of above attribute
        current_file: file name that stored the data (with out ".csv" suffix)
        run_time: running times of simulation using mergeUnfairRanking
        round_default: threshold of round function for the returned p-value
    Return:  rounded p-value
    """
    data = pd.read_csv(current_file + "_weightsum.csv")
    total_N = len(data)

    # for attribute value, compute the current pairs and estimated fair pairs
    position_lists_val = data[data[att_name] == att_value].index + 1
    size_vi = len(position_lists_val)

    fair_p_vi = size_vi / total_N

    # get the pre-computed pairwise results from simulation
    simu_data = read_json_file(
        "/home/ec2-user/dataResponsiblyUI/playdata/rankingfacts/SimulationPairs_N"
        + str(total_N) + "_R1000.json")
    # simu_data = read_json_file("./playdata/rankingfacts/SimulationPairs_N" + str(total_N) + "_R1000.json")

    all_fair_p = list(simu_data.keys())
    if str(fair_p_vi) in all_fair_p:
        cur_pi = str(fair_p_vi)
    else:
        # fall back to the simulated fair proportion closest to fair_p_vi
        diff_p = [abs(float(pi) - fair_p_vi) for pi in all_fair_p]
        min_diff_index = diff_p.index(min(diff_p))
        cur_pi = all_fair_p[min_diff_index]
    # count the pairs where att_value is ranked above other values in the input ranking stored in the current file
    pair_N_vi, estimated_fair_pair_vi, size_vi = computePairN(
        att_name, att_value, current_file)

    # compute the CDF, i.e. the p-value, of the input pair count
    sample_pairs = simu_data[cur_pi]

    cdf_pair = Cdf(sample_pairs, pair_N_vi)
    # decide to use left tail or right tail
    # mode_pair_sim,_ = mode(sample_pairs)
    # median_mode = np.median(list(mode_pair_sim))
    # if pair_N_vi <= mode_pair_sim:
    #     p_value = cdf_pair
    # else:
    #     p_value = 1- cdf_pair
    return round(cdf_pair, round_default)
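
Cdf is not defined in this snippet; below is a minimal empirical-CDF sketch consistent with how it is called above (an assumption, not necessarily the project's own implementation):

def Cdf(sample, x):
    # assumed behavior: fraction of simulated pair counts <= x
    return sum(1 for v in sample if v <= x) / len(sample)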
Example #8
    def describe_dataset_in_random_mode(
            self,
            dataset_file: str,
            attribute_to_datatype: Dict[str, DataType] = None,
            attribute_to_is_categorical: Dict[str, bool] = None,
            attribute_to_is_candidate_key: Dict[str, bool] = None,
            categorical_attribute_domain_file: str = None,
            numerical_attribute_ranges: Dict[str, List] = None,
            seed=0):
        attribute_to_datatype = attribute_to_datatype or {}
        attribute_to_is_categorical = attribute_to_is_categorical or {}
        attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
        numerical_attribute_ranges = numerical_attribute_ranges or {}

        if categorical_attribute_domain_file:
            categorical_attribute_to_domain = utils.read_json_file(
                categorical_attribute_domain_file)
        else:
            categorical_attribute_to_domain = {}

        utils.set_random_seed(seed)
        self.attr_to_datatype = {
            attr: DataType(datatype)
            for attr, datatype in attribute_to_datatype.items()
        }
        self.attr_to_is_categorical = attribute_to_is_categorical
        self.attr_to_is_candidate_key = attribute_to_is_candidate_key
        self.read_dataset_from_csv(dataset_file)
        self.infer_attribute_data_types()
        self.analyze_dataset_meta()
        self.represent_input_dataset_by_columns()

        for column in self.attr_to_column.values():
            attr_name = column.name
            if attr_name in categorical_attribute_to_domain:
                column.infer_domain(
                    categorical_domain=categorical_attribute_to_domain[
                        attr_name])
            elif attr_name in numerical_attribute_ranges:
                column.infer_domain(
                    numerical_range=numerical_attribute_ranges[attr_name])
            else:
                column.infer_domain()

        # record attribute information in json format
        self.data_description['attribute_description'] = {}
        for attr, column in self.attr_to_column.items():
            self.data_description['attribute_description'][
                attr] = column.to_json()
Example #9
    def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)
Example #10
    def generate_dataset_in_correlated_attribute_mode(self,
                                                      n,
                                                      description_file,
                                                      seed=0):
        self.n = n
        set_random_seed(seed)
        self.description = read_json_file(description_file)
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(
            self.n, self.description)

        # # use independent attribute mode for attributes ignored by BN, which are non-categorical strings.
        # for attr in self.description['meta']['attributes_ignored_by_BN']:
        #     attr_info = self.description['attribute_description'][attr]
        #     bins = attr_info['distribution_bins']
        #     probs = attr_info['distribution_probabilities']
        #     self.encoded_dataset[attr] = np.random.choice(list(range(len(bins))), size=n, p=probs)

        self.sample_from_encoded_dataset()
Example #11
def json_piechart_data(request):
    passed_data_name = request.session.get('passed_data_name')
    passed_running_data_flag = request.session.get("running_data")

    if passed_running_data_flag == "processed":
        cur_data_name = passed_data_name + "_norm"
        ranks_file = passed_data_name + "_norm_rankings.json"
    else:
        cur_data_name = passed_data_name
        ranks_file = passed_data_name + "_rankings.json"

    # read the parameter file on the server that stores all the parameter inputs from the user
    rankings_paras = read_json_file(ranks_file)
    checked_cate_atts = rankings_paras["checked_cate_atts"]
    piechart_json_data = get_chart_data(cur_data_name, checked_cate_atts)

    return HttpResponse(json.dumps(piechart_json_data),
                        content_type='application/json')
Example #12
    def generate_dataset_in_independent_mode(self,
                                             n,
                                             description_file,
                                             seed=0):
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        attributes = self.description['meta']['attribute_list']
        self.encoded_dataset = pd.DataFrame(columns=attributes,
                                            index=list(range(n)))
        for attr in attributes:
            attr_info = self.description['attribute_description'][attr]
            bins = attr_info['distribution_bins']
            probs = attr_info['distribution_probabilities']
            self.encoded_dataset[attr] = np.random.choice(
                list(range(len(bins))), size=n, p=probs)

        self.sample_from_encoded_dataset()
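
A hedged sketch of the description entries this method consumes (values are hypothetical; real files come from DataDescriber):

description = {
    'meta': {'attribute_list': ['age']},
    'attribute_description': {
        'age': {
            'distribution_bins': [17, 25, 35, 50],               # one bin per category/interval
            'distribution_probabilities': [0.3, 0.4, 0.2, 0.1],  # must sum to 1 for np.random.choice
        }
    }
}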
Example #13
def get_categorical_attributes(plot_json_file):
    plot_data = read_json_file(plot_json_file)
    return list(plot_data['barchart'].keys())
Example #14
def nutrition_facts(request):
    passed_data_name = request.session.get('passed_data_name')
    passed_running_data_flag = request.session.get("running_data")
    # flag telling the UI which parameter-setting page to return to for the previous step
    unprocessed_flag = True

    if passed_running_data_flag == "processed":
        ranks_file = passed_data_name + "_norm_rankings.json"
        cur_data_name = passed_data_name + "_norm"
        unprocessed_flag = False
    else:
        ranks_file = passed_data_name + "_rankings.json"
        cur_data_name = passed_data_name
    # read the parameter file on the server that stores all the parameter inputs from the user
    rankings_paras = read_json_file(ranks_file)
    chosed_atts = rankings_paras["ranked_atts"]
    checked_sensi_atts = rankings_paras["checked_sensi_atts"]
    checked_cate_atts = rankings_paras["checked_cate_atts"]

    # map each chosen attribute to its weight from the parameter file
    att_weights = dict(zip(chosed_atts, rankings_paras["ranked_atts_weight"]))

    # get size of upload data
    total_n = getSizeOfRanking(cur_data_name)

    # compute statistics of top 10 and overall in generated ranking
    att_stats_topTen = compute_statistic_topN(chosed_atts, cur_data_name, 10)
    att_stats_all = compute_statistic_topN(chosed_atts, cur_data_name, total_n)
    # compute top 3 correlated attributes
    att_correlated = compute_correlation(cur_data_name)
    # set the correlation threshold
    low_coef_threshold = 0.25
    high_coef_threshold = 0.75
    # generate the json data for the correlation table
    top3_correlated_atts = {}
    for ai in att_correlated:
        ai_coef = abs(ai[0])
        ai_name = ai[1]
        if ai_coef >= high_coef_threshold:
            top3_correlated_atts[ai_name] = [ai_coef, "high"]
        elif ai_coef <= low_coef_threshold:
            top3_correlated_atts[ai_name] = [ai_coef, "low"]
        else:
            top3_correlated_atts[ai_name] = [ai_coef, "median"]

    # compute statistics of top 3 correlated attributes for detailed ingredients widget
    top_corre_atts = [ai[1] for ai in att_correlated]
    corre_att_stats_topTen = compute_statistic_topN(top_corre_atts,
                                                    cur_data_name, 10)
    corre_att_stats_all = compute_statistic_topN(top_corre_atts, cur_data_name,
                                                 total_n)

    # compute the slope of generated scores at a specified top-k
    # set the slope threshold for stability
    slope_threshold = 0.25
    if total_n >= 100:
        slope_top_ten = computeSlopeOfScores(cur_data_name, 10)
        slope_top_hundred = computeSlopeOfScores(cur_data_name, 100)
        stable_ten = abs(slope_top_ten) <= slope_threshold
        stable_hundred = abs(slope_top_hundred) <= slope_threshold
        stable_res = {"Top-10": stable_ten, "Top-100": stable_hundred}
        slope_overall = "false"
    elif total_n >= 10:
        slope_top_ten = computeSlopeOfScores(cur_data_name, 10)
        slope_overall = computeSlopeOfScores(cur_data_name, total_n)
        stable_ten = abs(slope_top_ten) <= slope_threshold
        stable_overall = abs(slope_overall) <= slope_threshold
        slope_top_hundred = "NA"
        stable_res = {"Top-10": stable_ten, "Overall": stable_overall}
    else:
        slope_top_ten = "NA"
        slope_top_hundred = "NA"
        slope_overall = "false"
        stable_res = {}

    # run the fairness validation for three oracles
    fair_all_oracles, fair_res_oracles, alpha_default, top_K = runFairOracles(
        checked_sensi_atts, cur_data_name)

    checked_cate_att_ids = [
        "att" + str(i) for i in range(len(checked_cate_atts))
    ]
    # compute the number and layout of the pie charts
    pie_n = len(checked_cate_att_ids)
    row_n = int(np.ceil(pie_n / 2))
    place_n = int(5 + (row_n - 1) * 2)
    split_n = int(row_n * 2 + 1)

    context = {
        'passed_data_name': passed_data_name,
        "passed_att_weights": att_weights,
        "passed_att_stats_topTen": att_stats_topTen,
        "passed_att_stats_all": att_stats_all,
        "passed_att_correlated": top3_correlated_attts,
        "passed_unprocessing_flag": unprocessed_flag,
        "corre_att_stats_topTen": corre_att_stats_topTen,
        "corre_att_stats_all": corre_att_stats_all,
        "passed_fair_all_oracles": fair_all_oracles,
        "passed_fair_res_oracles": fair_res_oracles,
        "passed_slope_ten": slope_top_ten,
        "passed_slope_hundred": slope_top_hundred,
        "passed_stable_res": stable_res,
        "passed_slope_threshold": slope_threshold,
        "passed_alpha_default": alpha_default,
        "passed_coef_high": high_coef_threshold,
        "passed_top_k": top_K,
        "passed_coef_low": low_coef_threshold,
        "passed_slope_overall": slope_overall,
        "passed_pie_att_ids": checked_cate_att_ids,
        "passed_pie_atts": checked_cate_atts,
        "passed_range_place": range(place_n),
        "passed_range_split": range(split_n),
    }
    return render(request, 'rankingfacts/ranking_facts_widget_boot.html',
                  context)
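
computeSlopeOfScores is not shown in this listing; a minimal sketch consistent with how the view uses it, reading the '_weightsum.csv' written in Example #5 (an assumption, not the project's own implementation):

import numpy as np
import pandas as pd

def computeSlopeOfScores(current_file, top_k):
    # assumed behavior: least-squares slope of GeneratedScore over the top_k ranked rows
    scores = pd.read_csv(current_file + "_weightsum.csv")["GeneratedScore"].head(top_k)
    return np.polyfit(range(len(scores)), scores, 1)[0]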
Example #15
def get_drawable_attributes(plot_json_file):
    plot_data = read_json_file(plot_json_file)
    return list(plot_data['barchart'].keys()) + list(
        plot_data['histogram'].keys())
Example #16
def res_json_processing_plot(request):
    passed_data_name = request.session.get('passed_data_name')
    description_file = passed_data_name + "_plot.json"
    plot_json = read_json_file(description_file)
    return HttpResponse(json.dumps(plot_json), content_type='application/json')
Example #17
def generate_data(username):
    configuration = read_json_file('{}_parameters.json'.format(username))
    input_dataset_file = '{}.csv'.format(username)
    description_file = '{}_description.json'.format(username)
    synthetic_dataset_file = '{}_synthetic_data.csv'.format(username)

    initial_dataset_info = get_dataset_info(input_dataset_file)

    attribute_to_is_candidate = {
        attr: attr in configuration['candidate_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    attribute_to_is_categorical = {
        attr: attr in configuration['categorical_atts']
        for attr in initial_dataset_info['attribute_list']
    }

    # an empty tuple_n means: match the size of the input dataset
    if configuration['tuple_n'] == '':
        n = initial_dataset_info['number_of_tuples']
    else:
        n = int(configuration['tuple_n'])

    # if configuration['categorical_threshold'] == '':
    #     categorical_threshold = 10
    # else:
    #     categorical_threshold = int(configuration['categorical_threshold'])

    if configuration['seed'] == '':
        seed = 0
    else:
        seed = int(configuration['seed'])

    generator = DataGenerator()
    if configuration['chose_mode'] == 'mode1':
        describer = DataDescriber()
        describer.describe_dataset_in_random_mode(input_dataset_file, {},
                                                  attribute_to_is_categorical,
                                                  attribute_to_is_candidate,
                                                  seed)
        describer.save_dataset_description_to_file(description_file)
        generator.generate_dataset_in_random_mode(n, description_file, seed)
    else:
        if configuration['histogram_size'] == '':
            histogram_size = 20
        else:
            histogram_size = int(configuration['histogram_size'])

        if configuration['epsilon'] == '':
            epsilon = 10
        else:
            epsilon = float(configuration['epsilon'])

        attribute_to_datatype = configuration['type_atts']

        describer = DataDescriber(histogram_size)
        if configuration['chose_mode'] == 'mode2':
            describer.describe_dataset_in_independent_attribute_mode(
                input_dataset_file, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_independent_mode(
                n, description_file, seed)
        elif configuration['chose_mode'] == 'mode3':
            if configuration['max_degree'] == '':
                max_degree = 3
            else:
                max_degree = int(configuration['max_degree'])

            describer.describe_dataset_in_correlated_attribute_mode(
                input_dataset_file, max_degree, epsilon, attribute_to_datatype,
                attribute_to_is_categorical, attribute_to_is_candidate, seed)
            describer.save_dataset_description_to_file(description_file)
            generator.generate_dataset_in_correlated_attribute_mode(
                n, description_file, seed)

    generator.save_synthetic_data(synthetic_dataset_file)
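
A hedged sketch of the '{username}_parameters.json' consumed at the top of generate_data (keys taken from the reads above; the values are hypothetical):

configuration = {
    "chose_mode": "mode3",              # mode1: random, mode2: independent, mode3: correlated
    "tuple_n": "1000",                  # '' falls back to the input dataset's size
    "seed": "0",
    "histogram_size": "20",
    "epsilon": "0.1",
    "max_degree": "3",
    "candidate_atts": ["ssn"],          # hypothetical attribute names
    "categorical_atts": ["education"],
    "type_atts": {},                    # attribute -> datatype overrides
}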
Example #18
def norm_json_processing_hist(request):
    # NOTICE: the input name must use the _norm version when serving processed data
    passed_data_name = request.session.get('passed_data_name')
    description_file = passed_data_name + "_norm_plot.json"
    plot_json = read_json_file(description_file)
    return HttpResponse(json.dumps(plot_json), content_type='application/json')