def test_one_row_dataframe():
    """A one-row frame yields correct per-column properties and summaries."""
    items = [
        ("a", [1]),
        ("b", [-0.5]),
        ("c", ["hello"]),
        ("d", [datetime.datetime.now()]),
    ]
    expected_columns = sorted(key for key, _ in items)
    frame = pd.DataFrame.from_dict(dict(items))
    report = summarise(frame)._report
    assert sorted(report["_columns"]) == expected_columns

    props_by_column = report["column_properties"]
    # Every column holds exactly one non-null, unique value.
    for name in expected_columns:
        props = props_by_column[name]
        assert props["nulls"] == 0
        assert props["notnulls"] == 1
        assert props["unique"] == 1

    assert props_by_column["a"]["dtype"] == "int64"
    assert props_by_column["b"]["dtype"] == "float64"
    assert props_by_column["c"]["dtype"] == "object"
    assert props_by_column["d"]["dtype"] == "datetime64[ns]"

    summary = report["column_summary"]
    assert summary["a"]["max"] == 1
    assert summary["a"]["min"] == 1
    assert summary["a"]["mean"] == 1.0
    assert summary["a"]["median"] == 1.0
    assert summary["a"]["iqr"] == 0.0
    assert summary["b"]["max"] == -0.5
    assert summary["b"]["min"] == -0.5
    assert summary["b"]["median"] == -0.5
    assert summary["b"]["mean"] == -0.5
def test_one_row_dataframe():
    """A one-row frame yields correct per-column properties and summaries.

    NOTE: ``pd.DataFrame.from_items`` was removed in pandas 1.0; the frame
    is now built from a dict of column -> values instead (``dict`` preserves
    the insertion order of ``items`` on Python 3.7+).
    """
    items = [
        ('a', [1]),
        ('b', [-0.5]),
        ('c', ['hello']),
        ('d', [datetime.datetime.now()]),
    ]
    columns = sorted([item[0] for item in items])
    df = pd.DataFrame(dict(items))
    report = summarise(df)._report
    assert sorted(report['_columns']) == columns

    column_properties = report['column_properties']
    # Each column contains exactly one non-null, unique value.
    for column in columns:
        props = column_properties[column]
        assert props['nulls'] == 0
        assert props['notnulls'] == 1
        assert props['unique'] == 1

    assert column_properties['a']['dtype'] == 'int64'
    assert column_properties['b']['dtype'] == 'float64'
    assert column_properties['c']['dtype'] == 'object'
    assert column_properties['d']['dtype'] == 'datetime64[ns]'

    column_summary = report['column_summary']
    assert column_summary['a']['max'] == 1
    assert column_summary['a']['min'] == 1
    assert column_summary['a']['mean'] == 1.0
    assert column_summary['a']['median'] == 1.0
    assert column_summary['a']['iqr'] == 0.0
    assert column_summary['b']['max'] == -0.5
    assert column_summary['b']['min'] == -0.5
    assert column_summary['b']['median'] == -0.5
    assert column_summary['b']['mean'] == -0.5
def test_correlation_matrix_one_column():
    """The correlation matrix of a single column is the 1x1 identity."""
    column_values = np.random.ranf(size=200)
    df = pd.DataFrame.from_dict({"a": column_values})
    summary = summarise(df)
    columns, correlation_matrix = summary.correlation_matrix()
    assert columns == ["a"]
    assert correlation_matrix.shape == (1, 1)
    # Use the `np` alias for consistency with `np.random` above
    # (the original mixed `np.*` and fully-qualified `numpy.*`).
    np.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0)
def test_zero_rows_dataframe():
    """An empty (zero-row) frame reports zero counts for every column."""
    columns = sorted(['a', 'b', 'c', 'd'])
    report = summarise(pd.DataFrame(columns=columns))._report
    assert sorted(report['_columns']) == columns
    # With no rows, every count must be zero.
    for name in columns:
        properties = report['column_properties'][name]
        assert properties['nulls'] == 0
        assert properties['notnulls'] == 0
        assert properties['unique'] == 0
def test_zero_rows_dataframe():
    """Column properties are all zero for a frame with no rows."""
    column_names = sorted(["a", "b", "c", "d"])
    empty_frame = pd.DataFrame(columns=column_names)
    report = summarise(empty_frame)._report
    assert sorted(report["_columns"]) == column_names
    for name in column_names:
        column_props = report["column_properties"][name]
        # No rows means no nulls, no non-nulls and no unique values.
        assert column_props["nulls"] == 0
        assert column_props["notnulls"] == 0
        assert column_props["unique"] == 0
def test_dask_compute_graph(df, scheduler, num_workers, pairdensities):
    """summarise() honours the scheduler/num_workers/pairdensities options."""
    dreport = summarise(
        df,
        scheduler=scheduler,
        num_workers=num_workers,
        pairdensities=pairdensities,
    )._report
    # Only the default multiprocessing run is serialised to a known path.
    fname = None
    if scheduler == 'multiprocessing' and num_workers is None:
        fname = '{}/test_results/report_test_data_{}.json'.format(dirname, 'mp')
    assert dreport['_lens_version'] == __version__
    if not pairdensities:
        # With pair densities disabled the section must be an empty stub.
        assert dreport['pairdensity'] == {'_columns': [], '_run_time': 0.0}
    serialize_full_report(dreport, fname=fname)
def test_correlation_matrix_two_columns():
    """Off-diagonal entries equal the Spearman correlation of the columns."""
    column1_values = np.random.ranf(size=200)
    column2_values = np.random.ranf(size=200)
    df = pd.DataFrame.from_dict({"a": column1_values, "b": column2_values})
    summary = summarise(df)
    columns, correlation_matrix = summary.correlation_matrix()
    assert sorted(columns) == ["a", "b"]
    # Diagonal is 1: each column correlates perfectly with itself.
    # Use the `np` alias for consistency with `np.random` above
    # (the original mixed `np.*` and fully-qualified `numpy.*`).
    np.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0)
    np.testing.assert_approx_equal(correlation_matrix[1, 1], 1.0)
    off_diagonal_term = scipy.stats.spearmanr(df["a"], df["b"]).correlation
    # The matrix must be symmetric in its off-diagonal terms.
    np.testing.assert_approx_equal(correlation_matrix[1, 0], off_diagonal_term)
    np.testing.assert_approx_equal(correlation_matrix[0, 1], off_diagonal_term)
def test_summary_regression(input_):
    """Regression-test summarise() output against a stored S3 baseline."""
    # Load the input CSV into a pandas dataframe.
    df = pd.read_csv("s3://{}/input/{}".format(BUCKET, input_))
    summary = lens.summarise(df)
    # Persist the freshly generated report for later inspection.
    summary.to_json(os.path.join(result_dir, input_.replace(".csv", ".json")))
    # Fetch the expected report from S3 and decode it to text if needed.
    output = input_.replace(".csv", ".json")
    s3_summary = read_s3_file(BUCKET, "output/{}".format(output))["Body"].read()
    if isinstance(s3_summary, bytes):
        s3_summary = s3_summary.decode("utf-8")
    expected_summary = json.loads(s3_summary)
    # Keys excluded from the comparison: their values are probabilistically
    # generated and legitimately differ between runs.
    exclude = [
        "_run_time",
        "tdigest",
        "density",
        "bw",
        "logtrans_IQR",
        "kde",
        "_lens_version",
    ]
    diffs = find_diff(
        json.loads(json.dumps(summary._report)), expected_summary, exclude
    )
    for diff in diffs:
        print(diff)
    if len(diffs):
        # Save the expected report so the differences can be checked by hand.
        exp_name = os.path.join(
            result_dir, output.replace(".json", "-expected.json")
        )
        with open(exp_name, "w") as f:
            f.write(s3_summary)
    # The generated and expected summaries must match exactly.
    assert len(diffs) == 0
def test_correlation_matrix_three_columns():
    """Every matrix entry matches the pairwise Spearman correlation."""
    column_values = [np.random.ranf(size=200) for _ in range(3)]
    column_headers = ["a", "b", "c"]
    df = pd.DataFrame.from_dict(dict(zip(column_headers, column_values)))
    summary = summarise(df)
    columns, correlation_matrix = summary.correlation_matrix()
    assert sorted(columns) == column_headers
    for i, first_column in enumerate(columns):
        for j, second_column in enumerate(columns):
            expected = scipy.stats.spearmanr(
                df[first_column], df[second_column]
            ).correlation
            actual = correlation_matrix[i, j]
            # assert_approx_equal's signature is (actual, desired); the
            # original call had the arguments swapped, which only garbled
            # failure messages. Also use `np.testing` for consistency with
            # `np.random` above.
            np.testing.assert_approx_equal(actual, expected)
def test_correlation_matrix_two_columns():
    """Off-diagonal entries equal the Spearman correlation of the columns.

    NOTE: ``pd.DataFrame.from_items`` was removed in pandas 1.0, so the
    frame is built from a dict instead (insertion order is preserved).
    """
    column1_values = np.random.ranf(size=200)
    column2_values = np.random.ranf(size=200)
    df = pd.DataFrame({'a': column1_values, 'b': column2_values})
    summary = summarise(df)
    columns, correlation_matrix = summary.correlation_matrix()
    assert sorted(columns) == ['a', 'b']
    # Diagonal is 1: each column correlates perfectly with itself.
    numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0)
    numpy.testing.assert_approx_equal(correlation_matrix[1, 1], 1.0)
    off_diagonal_term = scipy.stats.spearmanr(df['a'], df['b']).correlation
    # The matrix must be symmetric in its off-diagonal terms.
    numpy.testing.assert_approx_equal(correlation_matrix[1, 0], off_diagonal_term)
    numpy.testing.assert_approx_equal(correlation_matrix[0, 1], off_diagonal_term)
def test_dask_compute_graph(df, scheduler, num_workers, pairdensities):
    """Summarising with explicit dask options produces a valid report."""
    report = summarise(
        df,
        scheduler=scheduler,
        num_workers=num_workers,
        pairdensities=pairdensities,
    )._report
    if scheduler == "multiprocessing" and num_workers is None:
        # Only the default multiprocessing configuration gets persisted.
        fname = "{}/test_results/report_test_data_{}.json".format(
            dirname, "mp"
        )
    else:
        fname = None
    assert report["_lens_version"] == __version__
    if not pairdensities:
        # With pair densities disabled the section must be an empty stub.
        assert report["pairdensity"] == {"_columns": [], "_run_time": 0.0}
    serialize_full_report(report, fname=fname)
def test_summary_regression(input_):
    """Compare a fresh summarise() report against the stored S3 baseline."""
    df = pd.read_csv('s3://{}/input/{}'.format(BUCKET, input_))
    summary = lens.summarise(df)
    # Keep the generated report around for manual inspection.
    summary.to_json(os.path.join(result_dir, input_.replace('.csv', '.json')))
    # Pull the expected report down from S3, decoding bytes to text.
    output = input_.replace('.csv', '.json')
    body = read_s3_file(BUCKET, 'output/{}'.format(output))['Body']
    s3_summary = body.read()
    if isinstance(s3_summary, bytes):
        s3_summary = s3_summary.decode('utf-8')
    expected_summary = json.loads(s3_summary)
    # These keys hold probabilistically generated values and so are
    # excluded from the comparison.
    exclude = [
        '_run_time',
        'tdigest',
        'density',
        'bw',
        'logtrans_IQR',
        'kde',
        '_lens_version',
    ]
    diffs = find_diff(
        json.loads(json.dumps(summary._report)), expected_summary, exclude
    )
    for diff in diffs:
        print(diff)
    if diffs:
        # Dump the expected report so differences can be checked by hand.
        exp_name = os.path.join(
            result_dir, output.replace('.json', '-expected.json')
        )
        with open(exp_name, 'w') as f:
            f.write(s3_summary)
    # Generated and expected summaries must match exactly.
    assert len(diffs) == 0
def test_int_num_cpus_env(small_df, monkeypatch):
    """An integer NUM_CPUS environment value is accepted by summarise()."""
    monkeypatch.setenv("NUM_CPUS", str(2))
    summary = summarise(small_df)
    assert set(summary._report["_columns"]) == set(small_df.columns)
def test_string_num_cpus_env(small_df, monkeypatch):
    """A non-integer NUM_CPUS value is tolerated rather than fatal."""
    monkeypatch.setenv("NUM_CPUS", "not-an-int")
    summary = summarise(small_df)
    assert set(summary._report["_columns"]) == set(small_df.columns)
def test_string_num_cpus_env(small_df, monkeypatch):
    """summarise() still works when NUM_CPUS is not parseable as an int."""
    monkeypatch.setenv('NUM_CPUS', 'not-an-int')
    result = summarise(small_df)
    assert set(result._report['_columns']) == set(small_df.columns)
def test_empty_df():
    """Summarising a dataframe with no columns raises EmptyDataFrameError."""
    with pytest.raises(EmptyDataFrameError):
        summarise(pd.DataFrame())
def artworks_summary(artworks_df):
    """Build and return a lens summary of the artworks dataframe."""
    return lens.summarise(artworks_df)