Example #1
0
def process_my_own_algo_db(db_loc, epsilon, batch_id=None):
    """
    Take a synthetic database generated by the my_own_algo algorithm and
    generate error statistics to be used in further analyses.

    db_loc:     path to the synthetic database CSV produced by my_own_algo
    epsilon:    privacy parameter used when generating the data (used for
                naming the output folder and in the summary line)
    batch_id:   optional str identifier to organise different analyses

    Copies the synthetic database into the data/test_output folder (in a
    batch if specified) and calculates errors compared to the original
    database. Also generates a summary file with the same format as
    kendall analyses, however since there is no parameter `k`, that
    field is written as 0.
    """

    with open(params.attribute_file, 'r') as file:
        attr_list = [attr.strip() for attr in file.readlines()]

    # Create relevant directories and copy file
    batch_dir = get_batch_directory(batch_id)
    timestamp = datetime.now().strftime(TIMESTAMP_FMT)
    epsilon_str = str(epsilon).replace('.', '-')
    data_dir = f'{batch_dir}/my_own_algo_{timestamp}_{epsilon_str}'
    makedirs(data_dir, exist_ok=True)

    db_file = f'{data_dir}/synthetic_data.csv'
    if not path.exists(db_file):
        copy(db_loc, db_file)

    # Load the synthetic data. This algorithm formats the database as a
    # list of binary attributes with values of 1 if the row has the
    # attribute and 0 otherwise.
    synthetic_data = np.genfromtxt(db_file, dtype=int, delimiter=',')

    original_db = Database()
    original_db.load_from_file(params.input_data_file, params.attribute_file)

    abs_errors = []
    rel_errors = []
    with open(f'{data_dir}/histogram_comparison.csv', 'w+') as file:
        file.write('Attribute,Original count,Synthetic count,'
                   'Absolute error,Relative Error\n')
        for index, attr in enumerate(attr_list):
            original = original_db.one_way_histogram[attr]
            synthetic = sum(synthetic_data[:, index])
            abs_error = abs(synthetic - original)
            # Guard against a zero original count, which previously raised
            # ZeroDivisionError; report an infinite relative error instead
            # so the anomaly stays visible in the comparison file.
            if original:
                rel_error = round(abs_error / original * 100, 2)
            else:
                rel_error = float('inf')
            file.write(f'{attr},{original},{synthetic},{abs_error},'
                       f'{rel_error}%\n')

            abs_errors.append(abs_error)
            rel_errors.append(rel_error)

    with open(f'{data_dir}/summary.txt', 'w+') as file:
        file.write('epsilon,k,avg absolute error,avg relative error,'
                   'median absolute error,median relative error\n')
        file.write(f'{epsilon},0,{np.mean(abs_errors)},'
                   f'{np.mean(rel_errors)},{np.median(abs_errors)},'
                   f'{np.median(rel_errors)}\n')
    def test_get_marginal_histogram_shorter(self):
        """Load a short test database and print its marginals for
        manual inspection alongside the DP marginals."""
        short_db = Database()
        short_db.load_from_file('data/auxiliary/short_orig.csv',
                                'data/input/adult_columns.csv')

        print(short_db.data)
        print(short_db.numerical_attr_table)

        first_marginal = short_db.get_marginal_histogram(0)
        print(first_marginal)

        dp_marginals = get_dp_marginals(short_db, 1)
        print(dp_marginals[0])
Example #3
0
    def setup_files(self):
        """
        Copy all data files required for the experiment into the
        experiment folder.

        Generates new Kendall data but reuses my_own_algo data as it takes
        much longer to generate.

        Copies files for all values of epsilon.
        """

        super().setup_files()

        # Mirror both source directories into the experiment folder
        # (data files first, then attribute files, as before).
        for source_dir in (self.data_dir, self.attr_dir):
            for filename in os.listdir(source_dir):
                shutil.copy(f'{source_dir}/{filename}',
                            f'{self.exp_path}/{filename}')

        original_db = Database()
        original_db.load_from_file(self.original_data_src,
                                   f'{self.exp_path}/adult_columns.csv')

        # Regenerate Kendall synthetic data per epsilon, splitting the
        # privacy budget 8:1 between the two stages of the algorithm.
        for epsilon in self.epsilons:
            kendall_data = kendall_algorithm(original_db,
                                             8 / 9 * epsilon,
                                             1 / 9 * epsilon)
            kendall_db = Database(kendall_data, original_db.attr_table)
            kendall_db.save_to_file(f'{self.exp_path}/'
                                    f'kendall_synthetic_{epsilon}.csv')
Example #4
0
def single_analysis(epsilon, k=8, batch_id=None, replace=False):
    """
    Generates a single set of synthetic data and a one-way histogram
    count of all attribute values.

    epsilon:    the total privacy parameter for the algorithm
    k:          the ratio of epsilon1 to epsilon2
    batch_id:   a str identifier to organise different analyses
    replace:    should replace analyses with same epsilon value

    Results are saved into 'DPCopula/data/test_output/' with each
    analysis with the same batch_id being saved in a batch_[batch_id]
    folder.
    """

    # Pick the batch folder and make sure it exists.
    batch_dir = get_batch_directory(batch_id)
    makedirs(batch_dir, exist_ok=True)

    if replace:
        # Remove any previous kendall analysis run at the same epsilon.
        for existing in listdir(batch_dir):
            if 'kendall' in existing:
                prev_eps = float(existing.split('_')[2].replace('-', '.'))
                if round(prev_eps, 6) == round(epsilon, 6):
                    rmtree(f'{batch_dir}/{existing}')

    epsilon = round(epsilon, 6)  # Ensure no floating point errors
    # Split the total budget between the two stages by the ratio k.
    epsilon1 = k / (k + 1) * epsilon
    epsilon2 = 1 / (k + 1) * epsilon

    timestamp = datetime.now().strftime(TIMESTAMP_FMT)
    epsilon_str = str(epsilon).replace('.', '-')
    output_dir = f'{batch_dir}/kendall_{timestamp}_{epsilon_str}'
    makedirs(output_dir, exist_ok=True)

    print(f'Running DPCopula-Kendall\tε = {epsilon}')

    # Load the original data and run the DP synthesis algorithm.
    original_db = Database()
    original_db.load_from_file(params.input_data_file, params.attribute_file)

    synthetic_data = kendall_algorithm(original_db, epsilon1, epsilon2)
    synthetic_db = Database(synthetic_data, original_db.attr_table)
    synthetic_db.save_to_file(f'{output_dir}/synthetic_data.csv')

    # Per-attribute error statistics.
    abs_errors, rel_errors = compare_databases(original_db, synthetic_db,
                                               output_dir)

    # Summary statistics in the shared analysis format.
    with open(f'{output_dir}/summary.txt', 'w+') as file:
        file.write('epsilon,k,avg absolute error,avg relative error,'
                   'median absolute error,median relative error\n')
        file.write(f'{epsilon},{k},{np.mean(abs_errors)},'
                   f'{np.mean(rel_errors)},{np.median(abs_errors)},'
                   f'{np.median(rel_errors)}\n')
Example #5
0
    def generate_two_way_histogram(self, database_file):
        """
        Generates a two-way histogram for a single synthetic database
        specified by the name of the algorithm used and the value of
        epsilon.

        database_file = exact filename of the data file without the
                        extension

        This operates identically to the function generating the one-way
        histogram except the output file will have the format
        '[attribute value 1],[attribute value 2],[count]'.
        """

        database_type = database_file.split('_')[0]

        if database_type in ('original', 'kendall'):
            # The original and kendall branches were duplicates except for
            # the output filename, so they share one code path.
            if database_type == 'original':
                out_path = f'{self.exp_path}/original_histogram.csv'
            else:
                epsilon = database_file.split('_')[2]
                out_path = f'{self.exp_path}/kendall_histogram_{epsilon}.csv'

            db = Database()
            db.load_from_file(f'{self.exp_path}/{database_file}.csv',
                              f'{self.exp_path}/adult_columns.csv')

            two_way_hist = db.generate_two_way_hist()

            with open(out_path, 'w+') as file:
                for pair, count in two_way_hist:
                    key1 = Database.avp_to_key(pair[0])
                    key2 = Database.avp_to_key(pair[1])

                    file.write(f'{key1},{key2},{count}\n')

        elif 'my_own_algo' == database_type:
            epsilon = database_file.split('_')[2]

            # my_own_algo stores binary attribute columns, so the pairwise
            # counts are computed directly from the (reordered) data matrix.
            reordered_data = self.reorder_my_own_algo_data(database_file)

            col_file = f'{self.exp_path}/adult_columns.csv'
            attr_table = Database.get_attrs_from_file(col_file)

            attr_val_pairs = [(attr, val) for attr in attr_table
                              for val in attr_table[attr]]

            with open(col_file, 'r') as file:
                attr_list = [attr.strip() for attr in file.readlines()]

            with open(f'{self.exp_path}/my_own_algo_histogram_{epsilon}.csv',
                      'w+') as file:
                for avp1 in attr_val_pairs:
                    for avp2 in attr_val_pairs:
                        # Emit each unordered pair of *distinct* attributes
                        # exactly once (attribute names strictly ordered).
                        if avp1[0] >= avp2[0]:
                            continue

                        key1 = Database.avp_to_key(avp1)
                        key2 = Database.avp_to_key(avp2)

                        col1 = attr_list.index(key1)
                        col2 = attr_list.index(key2)

                        count = sum(reordered_data[:, col1]
                                    & reordered_data[:, col2])

                        file.write(f'{key1},{key2},{count}\n')
Example #6
0
    def generate_one_way_histogram(self, database_file):
        """
        Generates a one-way histogram for a specific data file.

        Parameters:
            database_file = exact filename of the data file without the
                            extension
        This assumes the database file has either of the following formats:
            [algorithm]_synthetic_[epsilon].csv
            original_data.csv

        The file used will have the following format:
        data/experiments/[folder_name]/[database_file].csv

        The histogram will be stored in
        [algorithm]_histogram_[epsilon].csv if the database is synthetic
        or original_histogram.csv if it is the original database. Each
        line has the format '[attribute value],[count]'.

        If the database is my_own_algo synthetic, the columns will be
        reordered as to generate the same histogram order as the
        original and Kendall synthetic data produces.
        """

        database_type = database_file.split('_')[0]

        col_file = f'{self.exp_path}/adult_columns.csv'
        with open(col_file, 'r') as file:
            attr_list = [attr.strip() for attr in file.readlines()]

        if database_type in ('original', 'kendall'):
            # The original and kendall branches were duplicates except for
            # the output filename, so they share one code path.
            if database_type == 'original':
                out_path = f'{self.exp_path}/original_histogram.csv'
            else:
                epsilon = database_file.split('_')[2]
                out_path = f'{self.exp_path}/kendall_histogram_{epsilon}.csv'

            db = Database()
            db.load_from_file(f'{self.exp_path}/{database_file}.csv',
                              f'{self.exp_path}/adult_columns.csv')

            one_way_hist = db.generate_one_way_hist()

            with open(out_path, 'w+') as file:
                for attr in attr_list:
                    file.write(f'{attr},{one_way_hist[attr]}\n')

        elif database_type == 'my_own_algo':
            epsilon = database_file.split('_')[2]

            # Reorder the binary columns so the histogram rows line up with
            # the attribute order used by the original/Kendall outputs.
            reordered_data = self.reorder_my_own_algo_data(database_file)

            with open(f'{self.exp_path}/my_own_algo_histogram_{epsilon}.csv',
                      'w+') as file:
                for index, attr in enumerate(attr_list):
                    file.write(f'{attr},{sum(reordered_data[:, index])}\n')
Example #7
0
 def setUpClass(cls):
     # Shared fixture: load the full input database once for the class.
     # NOTE(review): this looks like a unittest setUpClass hook — the
     # @classmethod decorator is presumably just above this excerpt; confirm.
     db = Database()
     cls.db = db
     db.load_from_file(params.input_data_file, params.attribute_file)