def test_dist():
    """Check that ``BiFrame.dist`` yields bins and counts of matching length.

    For each sampled column, ``dist`` returns a ``(bins, counts)`` pair; the
    first two entries of ``counts`` must each have as many items as ``bins``.
    """
    pair = BiFrame(DataFrame(adults01), DataFrame(adults02))
    for name in ('age', 'education', 'relationship', 'salary', 'birth'):
        bins, counts = pair.dist(name)
        n_bins = len(bins)
        # presumably one counts entry per wrapped dataset -- verify in BiFrame
        assert n_bins == len(counts[0])
        assert n_bins == len(counts[1])
def test_to_html():
    """Check that ``BiFrame.to_html`` writes a report file to the given path.

    The report is written to ``a.html`` in the current working directory and
    is removed afterwards, whether or not the test succeeds.
    """
    import os.path

    frame = BiFrame(DataFrame(adults01), DataFrame(adults02))
    report = 'a.html'
    try:
        frame.to_html(report, classifier='SVM', labels=['education'])
        assert os.path.isfile(report)
    finally:
        # Always clean up: a failure inside to_html (or the assertion) must
        # not leave a stray report behind in the working directory.
        if os.path.exists(report):
            os.remove(report)
def test_corr():
    """Check the two matrices returned by ``BiFrame.corr``.

    Each matrix must be symmetric (equal to its transpose) and every entry
    must lie in the closed interval [0, 1].
    """
    # ``numpy.alltrue`` was removed in NumPy 2.0; ``numpy.all`` is the
    # function it aliased, so the checks below are unchanged in meaning.
    import numpy as np

    frame = BiFrame(DataFrame(adults01), DataFrame(adults02))
    a_mi, b_mi = frame.corr()

    assert np.allclose(a_mi, a_mi.T)
    assert np.all(a_mi >= 0.0)
    assert np.all(a_mi <= 1.0)

    assert np.allclose(b_mi, b_mi.T)
    assert np.all(b_mi >= 0.0)
    assert np.all(b_mi <= 1.0)
def test_classify_multiple_classes():
    """Check ``BiFrame.classify`` on the multi-valued 'education' label.

    The result must hold exactly two matrices, and both must use the full
    list of education levels as their column labels and as their index.
    """
    expected_labels = [
        '11th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors',
        'Doctorate', 'HS-grad', 'Masters', 'Some-college',
    ]
    matrix = BiFrame(DataFrame(adults01),
                     DataFrame(adults02)).classify('education')
    assert len(matrix) == 2
    for position in range(2):
        result = matrix[position]
        assert array_equal(result.columns, expected_labels)
        assert array_equal(result.index, expected_labels)
def test_describe():
    """Check the summary table returned by ``BiFrame.describe``.

    The table must have one column per shared attribute, rows labelled
    'err' and 'jsd', and every value within the closed interval [0, 1].
    """
    # ``numpy.alltrue`` was removed in NumPy 2.0; ``numpy.all`` is the
    # function it aliased, so the range checks keep the same meaning.
    import numpy as np

    frame = BiFrame(DataFrame(adults01), DataFrame(adults02))
    desc = frame.describe()

    # 'sex' not in adults01
    columns = ['age', 'birth', 'education', 'relationship', 'salary']
    assert array_equal(desc.columns, columns)
    assert np.all(desc >= 0.0)
    assert np.all(desc <= 1.0)
    assert array_equal(desc.index, ['err', 'jsd'])
def _build_evaluator_parser():
    """Build the argument parser for the utility-evaluation command line."""
    parser = argparse.ArgumentParser(
        description='Evaluate the utility of synthesized dataset compared with '
                    'the source dataset.',
        formatter_class=CustomFormatter,
        add_help=False)

    # positional arguments
    parser.add_argument('source',
                        help='set file path of source (raw) dataset to be '
                             'compared with synthesized dataset, only support '
                             'CSV files')
    parser.add_argument(
        'target', help='set file path of target (synthesized) dataset to '
                       'evaluate')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    # Help is registered by hand (add_help=False above) so that it is listed
    # under the "general arguments" group.
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN; ('
                            'default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE', default='report.html',
                       help='set output path for evaluation report; (default '
                            'is "report.html" under current work directory)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument(
        '-t', '--test',
        help='set test dataset for classification or regression '
             'task; (default take 20%% from source dataset)')
    group.add_argument(
        '--class-label', metavar='LIST',
        help='set column name as class label for classification '
             'or regression task; supports one or multiple '
             'columns (separated by comma)')
    return parser


def main():
    """Command-line entry point: compare a synthesized dataset with its source.

    Parses the command line, reads the source, target and optional test CSV
    files, verifies that every requested class label is a column of both the
    source and the target, then writes an HTML report to the output path and
    prints a one-line summary including the elapsed time.

    Exits with status 1 and a message on stderr when a class label is missing
    from the source or target file.
    """
    parser = _build_evaluator_parser()
    args = parser.parse_args()
    start = time.time()

    na_values = str_to_list(args.na_values)
    class_labels = str_to_list(args.class_label)
    categories = str_to_list(args.category)

    # Resolve the report path against the current working directory and make
    # sure its folder exists; exist_ok avoids the check-then-create race.
    args.output = os.path.join(os.getcwd(), args.output)
    os.makedirs(os.path.dirname(args.output), exist_ok=True)

    # Initialization task:
    source = read_data_from_csv(args.source, na_values=na_values,
                                header='infer')
    target = read_data_from_csv(args.target, na_values=na_values,
                                header='infer')
    # Read the test set with the same options as source/target so the
    # user-supplied NA markers are honoured consistently across all inputs.
    test = None
    if args.test is not None:
        test = read_data_from_csv(args.test, na_values=na_values,
                                  header='infer')

    for role, dataset in (('source', source), ('target', target)):
        missing = set(class_labels or []) - set(dataset.columns)
        if missing:
            # Non-zero status: a missing label is an error, not a clean exit.
            parser.exit(
                status=1,
                message=f'--class-label(s): {missing} are not in {role} file.')

    frame = BiFrame(source, target, categories=categories)
    frame.to_html(buffer=args.output, title='Data Utility Evaluation Report',
                  labels=class_labels, test=test)

    duration = time.time() - start
    print(f'Evaluate dataset {args.source} and {args.target} and generate '
          f'report at {args.output} in {round(duration, 2)} seconds.')
def test_classify_one_class():
    """Check ``BiFrame.classify`` on the two-valued 'salary' label.

    The result must hold exactly two matrices, and the first one must use the
    two salary classes as both its column labels and its index.
    """
    salary_classes = ['<=50K', '>50K']
    pair = BiFrame(DataFrame(adults01), DataFrame(adults02))
    matrix = pair.classify('salary')
    assert len(matrix) == 2
    first = matrix[0]
    assert array_equal(first.columns, salary_classes)
    assert array_equal(first.index, salary_classes)