def main():
    parser = argparse.ArgumentParser(
        description='Evaluate the utility of synthesized dataset compared with '
        'the source dataset.',
        formatter_class=CustomFormatter,
        add_help=False)
    # positional arguments
    parser.add_argument('source',
                        help='set file path of the source (raw) dataset to '
                        'be compared with the synthesized dataset; only CSV '
                        'files are supported')
    parser.add_argument(
        'target',
        help='set file path of the target (synthesized) dataset to '
        'evaluate')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h",
                       "--help",
                       action="help",
                       help="show this help message and exit")
    group.add_argument('--na-values',
                       metavar='LIST',
                       help='set additional values to recognize as NA/NaN; ('
                       'default null values are from pandas.read_csv)')
    group.add_argument('-o',
                       '--output',
                       metavar='FILE',
                       default='report.html',
                       help='set output path for the evaluation report; '
                       '(default is "report.html" under the current working '
                       'directory)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('--category',
                       metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument(
        '-t',
        '--test',
        help='set the test dataset for the classification or regression '
        'task; (default: take 20%% of the source dataset)')
    group.add_argument(
        '--class-label',
        metavar='LIST',
        help='set column name as class label for classification '
        'or regression task; supports one or multiple '
        'columns (separated by comma)')

    args = parser.parse_args()
    start = time.time()

    na_values = str_to_list(args.na_values)
    class_labels = str_to_list(args.class_label)
    categories = str_to_list(args.category)

    # resolve the output path
    args.output = os.path.join(os.getcwd(), args.output)
    # create the output folder if it does not exist
    if not os.path.exists(os.path.dirname(args.output)):
        os.makedirs(os.path.dirname(args.output))

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # load the source, target and optional test datasets
    source = read_data_from_csv(args.source,
                                na_values=na_values,
                                header='infer')
    target = read_data_from_csv(args.target,
                                na_values=na_values,
                                header='infer')
    test = read_data_from_csv(args.test) if args.test is not None else None

    comp = complement(class_labels, source.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--class-label(s): {comp} are not in the '
                            'source file.\n')
    comp = complement(class_labels, target.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--class-label(s): {comp} are not in the '
                            'target file.\n')

    frame = BiFrame(source, target, categories=categories)
    frame.to_html(buffer=args.output,
                  title='Data Utility Evaluation Report',
                  labels=class_labels,
                  test=test)

    duration = time.time() - start
    print(f'Evaluated datasets {args.source} and {args.target} and generated '
          f'the report at {args.output} in {round(duration, 2)} seconds.')
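
# NOTE: the snippets on this page call a str_to_list helper to split the
# comma-separated LIST options. The sketch below is only an assumption about
# its behavior, inferred from the call sites (None passes through untouched,
# otherwise the value is split into stripped column names); the library's
# actual implementation may differ.
def str_to_list(value, sep=','):
    if value is None:
        return None
    return [item.strip() for item in value.split(sep) if item.strip()]
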
def test_read_data_from_csv():
    from pandas import DataFrame
    from .testdata import adult_with_head, adult_with_head_res
    import io
    data = read_data_from_csv(io.StringIO(adult_with_head))
    assert data.equals(DataFrame(adult_with_head_res))
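
# The test above only pins down that read_data_from_csv returns a pandas
# DataFrame equal to the expected data. A minimal sketch consistent with the
# keyword arguments used by the CLIs (na_values, header, sep) is a thin
# wrapper around pandas.read_csv; the column renaming for header=None mirrors
# the --no-header help text and is an assumption, not necessarily the
# library's exact behavior.
import pandas as pd


def read_data_from_csv(path_or_buffer, na_values=None, header='infer',
                       sep=','):
    frame = pd.read_csv(path_or_buffer, na_values=na_values, header=header,
                        sep=sep)
    if header is None:
        # label the columns #0, #1, #2, ... when the file has no header row
        frame.columns = [f'#{i}' for i in range(frame.shape[1])]
    return frame
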
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by differential privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a CSV file to be synthesized, '
                                     'or path of a pattern file to synthesize '
                                     'from')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, which '
                            'will be replaced with pseudonyms; it only works '
                            'on string columns.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of the output synthesized "
                            "dataset (default is the input file name with the "
                            "suffix '-a.csv')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate that the CSV file has no header row; '
                            '[#0, #1, #2, ...] will be used as the header '
                            '(default: infer the header from the file)')
    group.add_argument('--records', metavar='INT', type=int,
                       help='specify the number of records to generate '
                            '(default: the same number of records as the '
                            'original dataset)')
    group.add_argument('--sep', metavar='STRING', default=',',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       help='set epsilon for differential privacy (default 0.1)',
                       default=0.1)
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns whose original values will be '
                            'retained')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    # check the file type from its extension
    is_pattern = ends_with_json(args.file)
    if is_pattern:
        if retains:
            parser.exit(status=1,
                        message='The --retain option is not supported when '
                                'synthesizing from a pattern file.\n')
        # construct DataSet from pattern file
        dataset = DataSet.from_pattern(args.file)
    else:
        data = read_data_from_csv(args.file, na_values=na_values, header=header,
                                  sep=args.sep)

        def complement(attrs, full):
            return set(attrs or []) - set(full)

        # check parameters: pseudonyms, deletes, categories
        comp = complement(pseudonyms, data.columns)
        if comp:
            parser.exit(
                status=1,
                message=f'--pseudonym columns: {comp} are not in the CSV '
                        'file.\n')
        comp = complement(deletes, data.columns)
        if comp:
            parser.exit(
                status=1,
                message=f'--delete columns: {comp} are not in the CSV '
                        'file.\n')
        comp = complement(categories, data.columns)
        if comp:
            parser.exit(
                status=1,
                message=f'--category columns: {comp} are not in the CSV '
                        'file.\n')

        dataset = DataSet(data, categories=categories)

    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms, deletes=deletes,
                                     retains=retains, records=args.records)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-a.csv'
    synthesized.to_csv(args.output, index=False, sep=args.sep)

    duration = time.time() - start
    print(f'Synthesized {args.file} to {args.output} in '
          f'{round(duration, 2)} seconds.')
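
# The synthesis CLI above also relies on two small helpers, ends_with_json
# and file_name. The sketches below are assumptions inferred from how they
# are used (detecting a .json pattern file, and deriving the default output
# names '<name>-a.csv' / '<name>-pattern.json'); the real implementations may
# differ.
import os


def ends_with_json(path):
    return os.path.splitext(path)[1].lower() == '.json'


def file_name(path):
    # base name of the input file without its extension
    return os.path.splitext(os.path.basename(path))[0]
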
def main():
    parser = argparse.ArgumentParser(
        description='Serialize patterns of a dataset anonymously',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a CSV file whose patterns '
                                     'will be serialized anonymously')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, which '
                            'will be replaced with pseudonyms; it only works '
                            'on string columns.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of the anonymous pattern file "
                            "(default is the input file name with the suffix "
                            "'-pattern.json')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate that the CSV file has no header row; '
                            '[#0, #1, #2, ...] will be used as the header '
                            '(default: infer the header from the file)')
    group.add_argument('--sep', metavar='STRING',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       help='set epsilon for differential privacy (default 0.1)',
                       default=0.1)
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    header = None if args.no_header else 'infer'
    sep = ',' if args.sep is None else args.sep

    data = read_data_from_csv(args.file, na_values=na_values, header=header,
                              sep=sep)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--pseudonym columns: {comp} are not in the CSV '
                            'file.\n')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--delete columns: {comp} are not in the CSV '
                            'file.\n')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--category columns: {comp} are not in the CSV '
                            'file.\n')

    dataset = DataSet(data, categories=categories)

    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-pattern.json'
    dataset.to_pattern(path=args.output, epsilon=args.epsilon, deletes=deletes,
                       pseudonyms=pseudonyms, retains=[])

    duration = time.time() - start
    print(f'Analyzed and serialized the patterns of {args.file} to '
          f'{args.output} in {round(duration, 2)} seconds.')
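
# A hedged end-to-end sketch of the pattern workflow combining the two CLIs
# above: serialize anonymous patterns from a CSV, then synthesize from the
# pattern file without touching the raw data again. File and column names
# ('adult.csv', 'education', 'income') are hypothetical; only calls that the
# CLIs themselves make are used here.
def pattern_roundtrip_example():
    data = read_data_from_csv('adult.csv', na_values=None, header='infer')
    DataSet(data, categories=['education', 'income']).to_pattern(
        path='adult-pattern.json', epsilon=0.1, deletes=None,
        pseudonyms=None, retains=[])
    # later, possibly on another machine or by another party
    dataset = DataSet.from_pattern('adult-pattern.json')
    synthesized = dataset.synthesize(epsilon=0.1, pseudonyms=None,
                                     deletes=None, retains=None, records=None)
    synthesized.to_csv('adult-a.csv', index=False)
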
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by Differential Privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of the CSV to be synthesized')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h",
                       "--help",
                       action="help",
                       help="show this help message and exit")
    group.add_argument(
        '--pseudonym',
        metavar='LIST',
        help='set candidate columns separated by a comma, which will be '
        'replaced with pseudonyms; it only works on string columns.')
    group.add_argument('--delete',
                       metavar='LIST',
                       help='set columns separated by a comma, which will be '
                       'deleted during synthesis.')
    group.add_argument('--na-values',
                       metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                       '(default null values are from pandas.read_csv)')
    group.add_argument('-o',
                       '--output',
                       metavar='FILE',
                       help="set the file name of output synthesized dataset ("
                       "default is input file name with suffix '_a')")
    group.add_argument('--no-header',
                       action='store_true',
                       help='indicate there is no header in a CSV file, and '
                       'will take [#0, #1, #2, ...] as header. (default: '
                       'the tool will try to detect and take actions)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument(
        '-e',
        '--epsilon',
        metavar='FLOAT',
        type=float,
        help='set epsilon for differential privacy (default 0.1)',
        default=0.1)
    group.add_argument('--category',
                       metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain',
                       metavar='LIST',
                       help='set columns whose original values will be '
                       'retained')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    data = read_data_from_csv(args.file, na_values=na_values, header=header)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(
            status=1,
            message=f'--pseudonym columns: {comp} are not in the CSV file.\n')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--delete columns: {comp} are not in the CSV '
                            'file.\n')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(status=1,
                    message=f'--category columns: {comp} are not in the CSV '
                            'file.\n')

    dataset = DataSet(data, categories=categories)
    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms,
                                     deletes=deletes,
                                     retains=retains)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}_a.csv'
    synthesized.to_csv(args.output, index=False)

    duration = time.time() - start
    print(f'Synthesized data {args.output} in {round(duration, 2)} seconds.')
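
# Each main() above re-defines the same inline complement() check. A possible
# refactoring (a sketch only, not part of the library) is a single
# module-level helper that validates a comma-separated option against the
# DataFrame columns and exits with a non-zero status when unknown columns are
# named, e.g. validate_columns(parser, '--pseudonym', pseudonyms, data.columns).
def validate_columns(parser, option, columns, full):
    unknown = set(columns or []) - set(full)
    if unknown:
        parser.exit(status=1,
                    message=f'{option} columns: {unknown} are not in the '
                            'CSV file.\n')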