def test_str_to_list():
    iva = '1,3,4,5'
    res = str_to_list(iva)
    assert res == ['1', '3', '4', '5']

    iva = 'name,age,weight,height'
    res = str_to_list(iva)
    assert res == ['name', 'age', 'weight', 'height']
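# The helper under test, `str_to_list`, is not shown in this section. Below is
# a minimal sketch of what the tests above imply, assuming it splits a
# comma-separated string and passes `None` through unchanged (the CLI entry
# points below rely on both behaviours); the shipped implementation may
# differ, e.g. by stripping whitespace around items.
def _str_to_list_sketch(val, separator=','):
    if val is None:
        return None
    return val.split(separator)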
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate the utility of synthesized dataset compared '
                    'with the source dataset.',
        formatter_class=CustomFormatter,
        add_help=False)
    # positional arguments
    parser.add_argument('source',
                        help='set file path of source (raw) dataset to be '
                             'compared with synthesized dataset; only '
                             'supports CSV files')
    parser.add_argument('target',
                        help='set file path of target (synthesized) dataset '
                             'to evaluate')
    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument('-h', '--help', action='help',
                       help='show this help message and exit')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE', default='report.html',
                       help='set output path for evaluation report (default '
                            'is "report.html" under the current working '
                            'directory)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma')
    group.add_argument('-t', '--test',
                       help='set test dataset for the classification or '
                            'regression task (default takes 20%% of the '
                            'source dataset)')
    group.add_argument('--class-label', metavar='LIST',
                       help='set column name as the class label for the '
                            'classification or regression task; supports one '
                            'or multiple columns (separated by a comma)')
    args = parser.parse_args()

    start = time.time()
    na_values = str_to_list(args.na_values)
    class_labels = str_to_list(args.class_label)
    categories = str_to_list(args.category)

    # resolve the output path and create its folder if it does not exist
    args.output = os.path.join(os.getcwd(), args.output)
    if not os.path.exists(os.path.dirname(args.output)):
        os.makedirs(os.path.dirname(args.output))

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # initialization: read source, target, and optional test datasets
    source = read_data_from_csv(args.source, na_values=na_values,
                                header='infer')
    target = read_data_from_csv(args.target, na_values=na_values,
                                header='infer')
    test = read_data_from_csv(args.test) if args.test is not None else None

    comp = complement(class_labels, source.columns)
    if comp:
        parser.exit(
            message=f'--class-label(s): {comp} are not in source file.')
    comp = complement(class_labels, target.columns)
    if comp:
        parser.exit(
            message=f'--class-label(s): {comp} are not in target file.')

    frame = BiFrame(source, target, categories=categories)
    frame.to_html(buffer=args.output,
                  title='Data Utility Evaluation Report',
                  labels=class_labels,
                  test=test)

    duration = time.time() - start
    print(f'Evaluate dataset {args.source} and {args.target} and generate '
          f'report at {args.output} in {round(duration, 2)} seconds.')
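# `read_data_from_csv` is referenced throughout these entry points but defined
# elsewhere. Below is a hedged sketch of its presumed behaviour, assuming it
# is a thin wrapper around pandas.read_csv with the na_values/header/sep
# arguments used above; the real helper may perform additional cleanup such as
# column-name normalization.
import pandas as pd

def _read_data_from_csv_sketch(path, na_values=None, header='infer', sep=','):
    return pd.read_csv(path, na_values=na_values, header=header, sep=sep)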
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by differential privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file',
                        help='set path of a CSV file to be synthesized, or '
                             'path of a pattern file to generate from')
    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument('-h', '--help', action='help',
                       help='show this help message and exit')
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym; it only '
                            'works on string columns')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help='set the file name of the output synthesized '
                            "dataset (default is the input file name with "
                            "suffix '-a.csv')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file; '
                            '[#0, #1, #2, ...] will be used as the header '
                            '(default: the tool will try to detect and take '
                            'actions)')
    group.add_argument('--records', metavar='INT', type=int,
                       help='specify the number of records to generate '
                            '(default is the same number of records as the '
                            'original dataset)')
    group.add_argument('--sep', metavar='STRING', default=',',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns whose values will be retained')
    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    # check the file type from its extension
    is_pattern = ends_with_json(args.file)
    if is_pattern:
        if retains is not None and len(retains) != 0:
            parser.exit(message='--retain is not supported when synthesizing '
                                'from a pattern file.')
        # construct DataSet from pattern file
        dataset = DataSet.from_pattern(args.file)
    else:
        data = read_data_from_csv(args.file, na_values=na_values,
                                  header=header, sep=args.sep)

        def complement(attrs, full):
            return set(attrs or []) - set(full)

        # check parameters: pseudonyms, deletes, categories
        comp = complement(pseudonyms, data.columns)
        if comp:
            parser.exit(
                message=f'--pseudonym columns: {comp} are not in csv file.')
        comp = complement(deletes, data.columns)
        if comp:
            parser.exit(
                message=f'--delete columns: {comp} are not in csv file.')
        comp = complement(categories, data.columns)
        if comp:
            parser.exit(
                message=f'--category columns: {comp} are not in csv file.')

        dataset = DataSet(data, categories=categories)

    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms,
                                     deletes=deletes,
                                     retains=retains,
                                     records=args.records)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-a.csv'
    synthesized.to_csv(args.output, index=False, sep=args.sep)

    duration = time.time() - start
    print(f'Synthesize from {args.file} to file {args.output} in '
          f'{round(duration, 2)} seconds.')
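# `ends_with_json` and `file_name` are small path helpers defined elsewhere.
# Below are hedged sketches of their presumed behaviour, inferred from how
# they are used above (extension check and base-name extraction); the shipped
# utilities may handle more edge cases.
import os

def _ends_with_json_sketch(path):
    return os.path.splitext(path)[1].lower() == '.json'

def _file_name_sketch(path):
    return os.path.splitext(os.path.basename(path))[0]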
def main():
    parser = argparse.ArgumentParser(
        description='Serialize patterns of a dataset anonymously',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file',
                        help='set path of a CSV file to be patterned '
                             'anonymously')
    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument('-h', '--help', action='help',
                       help='show this help message and exit')
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym; it only '
                            'works on string columns')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help='set the file name of anonymous patterns (default '
                            "is the input file name with suffix "
                            "'-pattern.json')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file; '
                            '[#0, #1, #2, ...] will be used as the header '
                            '(default: the tool will try to detect and take '
                            'actions)')
    group.add_argument('--sep', metavar='STRING',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma')
    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    header = None if args.no_header else 'infer'
    sep = ',' if args.sep is None else args.sep

    data = read_data_from_csv(args.file, na_values=na_values, header=header,
                              sep=sep)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(
            message=f'--pseudonym columns: {comp} are not in csv file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in csv file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in csv file.')

    dataset = DataSet(data, categories=categories)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-pattern.json'
    dataset.to_pattern(path=args.output, epsilon=args.epsilon,
                       deletes=deletes, pseudonyms=pseudonyms, retains=[])

    duration = time.time() - start
    print(f'Analyze and serialize the patterns of {args.file} at '
          f'{args.output} in {round(duration, 2)} seconds.')
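# Example entry point and invocation for the pattern command above. The
# console-script name `data-pattern` and the column names are assumptions for
# illustration only; adjust them to the project's actual packaging
# configuration:
#
#   data-pattern adult.csv --category sex,race -e 0.2 -o adult-pattern.json
#
if __name__ == '__main__':
    main()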
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by differential privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of the CSV to be synthesized')
    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument('-h', '--help', action='help',
                       help='show this help message and exit')
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym; it only '
                            'works on string columns')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help='set the file name of the output synthesized '
                            "dataset (default is the input file name with "
                            "suffix '_a')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file; '
                            '[#0, #1, #2, ...] will be used as the header '
                            '(default: the tool will try to detect and take '
                            'actions)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns whose values will be retained')
    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    data = read_data_from_csv(args.file, na_values=na_values, header=header)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(
            message=f'--pseudonym columns: {comp} are not in csv file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in csv file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in csv file.')

    dataset = DataSet(data, categories=categories)
    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms,
                                     deletes=deletes,
                                     retains=retains)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}_a.csv'
    synthesized.to_csv(args.output, index=False)

    duration = time.time() - start
    print(f'Synthesized data {args.output} in {round(duration, 2)} seconds.')
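# A hedged sketch of driving the same synthesis programmatically, without
# argparse. `read_data_from_csv`, `DataSet`, and `synthesize` are the names
# used in the entry point above; the default epsilon mirrors the CLI default,
# and this wrapper itself is illustrative, not part of the tool.
def _synthesize_file_sketch(path, epsilon=0.1, categories=None):
    data = read_data_from_csv(path)
    dataset = DataSet(data, categories=categories)
    return dataset.synthesize(epsilon=epsilon)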