def test_mismatched_tags_raises_assertion_error():
    now = datetime.datetime.utcnow()
    x1 = DatasetProfile('test', now, tags=['foo'])
    x2 = DatasetProfile('test', now, tags=['bar'])
    try:
        x1.merge(x2)
        raise RuntimeError('Assertion error not raised')
    except AssertionError:
        pass
def test_merge_different_columns():
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    x1 = DatasetProfile(
        name='test',
        session_id=shared_session_id,
        session_timestamp=now,
        tags=['tag'],
        metadata={'key': 'value'},
    )
    x1.track('col1', 'value')
    x2 = DatasetProfile(
        name='test',
        session_id=shared_session_id,
        session_timestamp=now,
        tags=['tag'],
        metadata={'key': 'value'},
    )
    x2.track('col2', 'value')

    merged = x1.merge(x2)

    assert merged.name == 'test'
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert set(merged.columns.keys()) == {'col1', 'col2'}
    assert merged.columns['col1'].counters.count == 1
    assert merged.columns['col2'].counters.count == 1
    assert len(merged.tags) == 1
    assert 'tag' in merged.tags
def test_empty_valid_datasetprofiles_empty():
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    x1 = DatasetProfile(
        name='test',
        session_id=shared_session_id,
        session_timestamp=now,
        tags=['tag'],
        metadata={'key': 'value'},
    )
    x2 = DatasetProfile(
        name='test',
        session_id=shared_session_id,
        session_timestamp=now,
        tags=['tag'],
        metadata={'key': 'value'},
    )

    merged = x1.merge(x2)

    assert merged.name == 'test'
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert merged.columns == {}
def test_log_dataframe(df_lending_club):
    reset_default()
    get_or_create_session(
        output_to_stdout=False,
        output_to_cloud=False,
        local_output_folder=TEST_OUTPUT_DIR,
        pipeline="pipeline_name",
        team="team_name",
    )
    logger = get_logger()
    response = logger.log_dataframe(df_lending_club, 'lending_club')

    rs = response['handler_responses']
    assert len(rs) == 3
    expected_outputs = {
        ('protobuf', 'disk'),
        ('flat', 'disk'),
        ('json', 'disk'),
    }
    outputs = set()
    for r in rs:
        handler_response = r['response']
        if r['fmt'] == 'protobuf' and r['dest'] == 'disk':
            outputs.add(('protobuf', 'disk'))
            # Verify that we can load the protobuf
            with open(handler_response.dest, 'rb') as fp:
                DatasetProfile.from_protobuf_string(fp.read())
        elif r['fmt'] == 'flat' and r['dest'] == 'disk':
            outputs.add(('flat', 'disk'))
            # Verify that we can load all the flat data outputs
            pd.read_csv(handler_response.dest['flat_table'])
            with open(handler_response.dest['histogram'], 'rt') as fp:
                json.load(fp)
            with open(handler_response.dest['freq_strings'], 'rt') as fp:
                json.load(fp)
        elif r['fmt'] == 'json' and r['dest'] == 'disk':
            outputs.add(('json', 'disk'))
            # Verify that we can load the JSON summary
            with open(handler_response.dest, 'rt') as fp:
                json.load(fp)
    assert outputs == expected_outputs

    shutil.rmtree(TEST_OUTPUT_DIR)
def test_non_sorted_tags_raise_value_error():
    now = datetime.datetime.utcnow()
    tags = ['tag1', 'tag2']
    x = DatasetProfile('test', now, tags=tags)
    x.validate()

    # Append a tag that breaks the sorted order
    x.tags = tags + ['aaaa']
    try:
        x.validate()
        raise RuntimeError("validate should raise a ValueError")
    except ValueError:
        pass
def test_non_string_tag_raises_assert_error():
    now = datetime.datetime.utcnow()
    tags = ['tag1', 'tag2']
    x = DatasetProfile('test', now, tags=tags)
    x.validate()

    # Include a non-string tag
    x.tags = tags + [1]
    try:
        x.validate()
        raise RuntimeError("validate should raise an AssertionError")
    except AssertionError:
        pass
def test_protobuf_round_trip():
    now = datetime.datetime.utcnow()
    tags = ('rock', 'scissors', 'paper')
    original = DatasetProfile(name="test", data_timestamp=now, tags=tags)
    original.track('col1', 'value')
    original.track('col2', 'value')

    msg = original.to_protobuf()
    roundtrip = DatasetProfile.from_protobuf(msg)

    assert roundtrip.to_protobuf() == msg
    assert roundtrip.name == 'test'
    assert roundtrip.session_id == original.session_id
    assert to_utc_ms(roundtrip.session_timestamp) == to_utc_ms(
        original.session_timestamp)
    assert set(roundtrip.columns.keys()) == {'col1', 'col2'}
    assert roundtrip.columns['col1'].counters.count == 1
    assert roundtrip.columns['col2'].counters.count == 1
    assert set(roundtrip.tags) == set(tags)
    assert roundtrip.metadata == original.metadata
def test_name_always_appear_in_metadata():
    x1 = DatasetProfile(name='test')
    assert x1.metadata['Name'] == 'test'
def run(input_path,
        datetime: str = None,
        delivery_stream=None,
        fmt=None,
        limit=-1,
        output_prefix=None,
        region=None,
        separator=None,
        dropna=False,
        infer_dtypes=False):
    """
    Run the profiler on CSV data

    Output Notes
    ------------
    <output_prefix>_<name>_summary.csv
        Dataset profile. Contains scalar statistics per column
    <output_prefix>_<name>_histogram.json
        Histograms for each column for dataset `name`
    <output_prefix>_<name>_strings.json
        Frequent strings
    <output_prefix>.json
        DatasetSummaries, nested JSON summaries of dataset statistics
    <output_prefix>.bin
        Binary protobuf output of DatasetProfile

    Parameters
    ----------
    input_path : str
        Input CSV file
    datetime : str
        Column containing timestamps. If missing, we assume the dataset is
        running in batch mode
    delivery_stream : str
        [IGNORED] The delivery stream name
    fmt : str
        Format of the datetime column, used if `datetime` is specified.
        If not specified, we will attempt to infer the format
    limit : int
        Limit the number of entries to process
    output_prefix : str
        Prefix for the output files. By default, this is derived from the
        input path to generate files in the input directory. Can include
        folders
    region : str
        [IGNORED] AWS region name for Firehose
    separator : str
        Record separator. Default = ','
    dropna : bool
        Drop null values when reading
    infer_dtypes : bool
        Infer input datatypes when reading. If false, treat inputs as
        unconverted strings
    """
    datetime_col = datetime  # don't shadow the standard module name
    from whylabs.logs.core import DatasetProfile
    from whylabs.logs.proto import DatasetSummaries
    from whylabs.logs.util.protobuf import message_to_json
    from datetime import datetime
    import os

    logger = getLogger(LOGGER)

    # Parse arguments
    if separator is None:
        separator = ','
    name = os.path.basename(input_path)
    parse_dates = False
    if datetime_col is not None:
        parse_dates = [datetime_col]
    nrows = None
    if limit > 0:
        nrows = limit

    if output_prefix is None:
        import random
        import time
        parent_folder = os.path.dirname(os.path.realpath(input_path))
        basename = os.path.splitext(os.path.basename(input_path))[0]
        epoch_minutes = int(time.time() / 60)
        output_base = "{}.{}-{}-{}".format(
            basename,
            epoch_minutes,
            random.randint(100000, 999999),
            random.randint(100000, 999999),
        )
        output_prefix = os.path.join(parent_folder, output_base)
    output_base = output_prefix
    binary_output_path = output_base + '.bin'
    json_output_path = output_base + '.json'

    # Process records, grouping profiles by formatted date
    reader = csv_reader(input_path, fmt, parse_dates=parse_dates,
                        nrows=nrows, sep=separator, dropna=dropna,
                        infer_dtypes=infer_dtypes)
    profiles = {}
    for record in reader:
        dt = record.get(datetime_col, datetime.utcnow())
        assert isinstance(dt, datetime)
        dt_str = dt.strftime(OUTPUT_DATE_FORMAT)
        try:
            ds = profiles[dt_str]
        except KeyError:
            ds = DatasetProfile(name, dt)
            profiles[dt_str] = ds
        ds.track(record)
    logger.info("Finished collecting statistics")

    # Build summaries for the JSON output
    summaries = DatasetSummaries(
        profiles={k: v.to_summary() for k, v in profiles.items()})
    with open(json_output_path, 'wt') as fp:
        logger.info("Writing JSON summaries to: {}".format(json_output_path))
        fp.write(message_to_json(summaries))

    # Generate flat summary outputs
    fnames = write_flat_summaries(summaries, output_base, dataframe_fmt='csv')

    # Write the protobuf binary file
    write_protobuf(profiles.values(), binary_output_path)

    return profiles
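# Example invocation of `run` (a minimal sketch; the CSV path and the 'date'
# column name below are hypothetical placeholders, not part of this module):
#
#   profiles = run('lending_club.csv', datetime='date', limit=1000)
#   for dt_str, profile in profiles.items():
#       print(dt_str, profile.name)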
def _write_protobuf(path: str, profile: DatasetProfile):
    protobuf: Message = profile.to_protobuf()
    # Serialize the profile into the target directory
    with open(os.path.join(path, 'protobuf.bin'), 'wb') as f:
        f.write(protobuf.SerializeToString())
def _write_flat(path: str, profile: DatasetProfile):
    summary = profile.to_summary()
    flat_summary: dict = datasetprofile.flatten_summary(summary)

    # TODO: use absolute path when writing out data
    os.chdir(path)
    datasetprofile.write_flat_dataset_summary(flat_summary, 'summary')
def _write_json(path: str, profile: DatasetProfile):
    summary = profile.to_summary()
    with open(os.path.join(path, 'whylogs.json'), 'wt') as f:
        f.write(message_to_json(summary))
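# A minimal sketch (not part of the module API) of how the three writer
# helpers above might be combined; `profile` and `out_dir` are hypothetical.
# An absolute path is used, and `_write_flat` is called last, because it
# changes the working directory via os.chdir:
#
#   out_dir = os.path.abspath('whylogs_output')
#   os.makedirs(out_dir, exist_ok=True)
#   _write_protobuf(out_dir, profile)
#   _write_json(out_dir, profile)
#   _write_flat(out_dir, profile)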