Example #1
def test_mismatched_tags_raises_assertion_error():
    now = datetime.datetime.utcnow()
    x1 = DatasetProfile('test', now, tags=['foo'])
    x2 = DatasetProfile('test', now, tags=['bar'])
    try:
        x1.merge(x2)
        raise RuntimeError('Assertion error not raised')
    except AssertionError:
        pass
Example #2
def test_merge_different_columns():
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    x1 = DatasetProfile(name='test',
                        session_id=shared_session_id,
                        session_timestamp=now,
                        tags=['tag'],
                        metadata={'key': 'value'})
    x1.track('col1', 'value')
    x2 = DatasetProfile(name='test',
                        session_id=shared_session_id,
                        session_timestamp=now,
                        tags=['tag'],
                        metadata={'key': 'value'})
    x2.track('col2', 'value')

    merged = x1.merge(x2)

    assert merged.name == 'test'
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert set(merged.columns.keys()) == {'col1', 'col2'}
    assert merged.columns['col1'].counters.count == 1
    assert merged.columns['col2'].counters.count == 1
    assert len(merged.tags) == 1
    assert 'tag' in merged.tags
Example #3
def test_empty_valid_datasetprofiles_empty():
    now = datetime.datetime.utcnow()
    shared_session_id = uuid4().hex
    x1 = DatasetProfile(name='test',
                        session_id=shared_session_id,
                        session_timestamp=now,
                        tags=['tag'],
                        metadata={'key': 'value'})
    x2 = DatasetProfile(name='test',
                        session_id=shared_session_id,
                        session_timestamp=now,
                        tags=['tag'],
                        metadata={'key': 'value'})

    merged = x1.merge(x2)
    assert merged.name == 'test'
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert merged.columns == {}
Example #4
def test_log_dataframe(df_lending_club):
    reset_default()
    get_or_create_session(
        output_to_stdout=False,
        output_to_cloud=False,
        local_output_folder=TEST_OUTPUT_DIR,
        pipeline="pipeline_name",
        team="team_name",
    )
    logger = get_logger()
    response = logger.log_dataframe(df_lending_club, 'lending_club')
    rs = response['handler_responses']
    assert len(rs) == 3
    expected_outputs = {
        ('protobuf', 'disk'),
        ('flat', 'disk'),
        ('json', 'disk'),
    }
    outputs = set()
    for r in rs:
        handler_response = r['response']
        if r['fmt'] == 'protobuf' and r['dest'] == 'disk':
            outputs.add(('protobuf', 'disk'))
            # Verify that we can load the protobuf
            with open(handler_response.dest, 'rb') as fp:
                DatasetProfile.from_protobuf_string(fp.read())
        elif r['fmt'] == 'flat' and r['dest'] == 'disk':
            outputs.add(('flat', 'disk'))
            # Verify we can load all the flat data outputs
            pd.read_csv(handler_response.dest['flat_table'])
            with open(handler_response.dest['histogram'], 'rt') as fp:
                json.load(fp)
            with open(handler_response.dest['freq_strings'], 'rt') as fp:
                json.load(fp)
        elif r['fmt'] == 'json' and r['dest'] == 'disk':
            outputs.add(('json', 'disk'))
            # Verify we can load the JSON summary
            with open(handler_response.dest, 'rt') as fp:
                json.load(fp)

    assert outputs == expected_outputs

    shutil.rmtree(TEST_OUTPUT_DIR)
Example #5
def test_non_sorted_tags_raise_value_error():
    now = datetime.datetime.utcnow()
    tags = ['tag1', 'tag2']
    x = DatasetProfile('test', now, tags=tags)
    x.validate()
    # Append a tag that breaks the sorted order
    x.tags = tags + ['aaaa']
    try:
        x.validate()
        raise RuntimeError("validate should raise an ValueError")
    except ValueError:
        pass
Example #6
def test_non_string_tag_raises_assert_error():
    now = datetime.datetime.utcnow()
    tags = ['tag1', 'tag2']
    x = DatasetProfile('test', now, tags=tags)
    x.validate()
    # Include a non-string tag
    x.tags = tags + [1]
    try:
        x.validate()
        raise RuntimeError("validate should raise an AssertionError")
    except AssertionError:
        pass
Example #7
def test_protobuf_round_trip():
    now = datetime.datetime.utcnow()
    tags = ('rock', 'scissors', 'paper')
    original = DatasetProfile(name="test", data_timestamp=now, tags=tags)
    original.track('col1', 'value')
    original.track('col2', 'value')

    msg = original.to_protobuf()
    roundtrip = DatasetProfile.from_protobuf(msg)

    assert roundtrip.to_protobuf() == msg
    assert roundtrip.name == 'test'
    assert roundtrip.session_id == original.session_id
    assert to_utc_ms(roundtrip.session_timestamp) == to_utc_ms(
        original.session_timestamp)
    assert set(roundtrip.columns.keys()) == {'col1', 'col2'}
    assert roundtrip.columns['col1'].counters.count == 1
    assert roundtrip.columns['col2'].counters.count == 1
    assert set(roundtrip.tags) == set(tags)
    assert roundtrip.metadata == original.metadata
Example #8
def test_name_always_appear_in_metadata():
    x1 = DatasetProfile(name='test')
    assert x1.metadata['Name'] == 'test'
Example #9
def run(input_path,
        datetime: str = None,
        delivery_stream=None,
        fmt=None,
        limit=-1,
        output_prefix=None,
        region=None,
        separator=None,
        dropna=False,
        infer_dtypes=False):
    """
    Run the profiler on CSV data

    Output Notes
    ------------
    <output_prefix>_<name>_summary.csv
        Dataset profile.  Contains scalar statistics per column
    <output_prefix>_<name>_histogram.json
        Histograms for each column for dataset `name`
    <output_prefix>_<name>_strings.json
        Frequent strings
    <output_prefix>.json
        DatasetSummaries, nested JSON summaries of dataset statistics
    <output_prefix>.bin
        Binary protobuf output of DatasetProfile


    Parameters
    ----------
    input_path : str
        Input CSV file
    datetime : str
        Column containing timestamps.  If missing, we assume the dataset is
        running in batch mode
    delivery_stream : str
        [IGNORED] The delivery stream name
    fmt : str
        Format of the datetime column, used if `datetime` is specified.
        If not specified, we will attempt to infer the format.
    limit : int
        Limit the number of entries to process
    output_prefix : str
        Specify a prefix for the output files.  By default, this will be
        derived from the input path to generate files in the input directory.
        Can include folders
    region : str
        [IGNORED] AWS region name for Firehose
    separator : str
        Record separator.  Default = ','
    dropna : bool
        Drop null values when reading
    infer_dtypes : bool
        Infer input datatypes when reading.  If false, treat inputs as
        unconverted strings.
    """
    datetime_col = datetime  # don't shadow the standard module name
    from whylabs.logs.core import DatasetProfile
    from whylabs.logs.proto import DatasetSummaries
    from whylabs.logs.util.protobuf import message_to_json
    from datetime import datetime
    import os
    logger = getLogger(LOGGER)

    # Parse arguments
    if separator is None:
        separator = ','
    name = os.path.basename(input_path)
    parse_dates = False
    if datetime_col is not None:
        parse_dates = [datetime_col]
    nrows = None
    if limit > 0:
        nrows = limit
    if output_prefix is None:
        import random
        import time
        parent_folder = os.path.dirname(os.path.realpath(input_path))
        basename = os.path.splitext(os.path.basename(input_path))[0]
        epoch_minutes = int(time.time() / 60)
        output_base = "{}.{}-{}-{}".format(basename, epoch_minutes,
                                           random.randint(100000, 999999),
                                           random.randint(100000, 999999))
        output_prefix = os.path.join(parent_folder, output_base)

    output_base = output_prefix
    binary_output_path = output_base + '.bin'
    json_output_path = output_base + '.json'

    # Process records
    reader = csv_reader(input_path,
                        fmt,
                        parse_dates=parse_dates,
                        nrows=nrows,
                        sep=separator,
                        dropna=dropna,
                        infer_dtypes=infer_dtypes)
    profiles = {}
    for record in reader:
        dt = record.get(datetime_col, datetime.utcnow())
        assert isinstance(dt, datetime)
        dt_str = dt.strftime(OUTPUT_DATE_FORMAT)
        try:
            ds = profiles[dt_str]
        except KeyError:
            ds = DatasetProfile(name, dt)
            profiles[dt_str] = ds
        ds.track(record)

    logger.info("Finished collecting statistics")

    # Build summaries for the JSON output
    summaries = DatasetSummaries(
        profiles={k: v.to_summary()
                  for k, v in profiles.items()})
    with open(json_output_path, 'wt') as fp:
        logger.info("Writing JSON summaries to: {}".format(json_output_path))
        fp.write(message_to_json(summaries))

    # Generate flat summary outputs
    fnames = write_flat_summaries(summaries, output_base, dataframe_fmt='csv')

    # Write the protobuf binary file
    write_protobuf(profiles.values(), binary_output_path)
    return profiles
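The docstring above lists the files that `run` writes and the meaning of each argument. As a rough sketch only, a call along the following lines would profile a CSV in batch mode; the file name 'data.csv', the 'date' column, and the output prefix are hypothetical, and the import path for `run` depends on how this module is packaged.

# Hypothetical invocation of run(); 'data.csv', the 'date' column and the
# output prefix are made-up names used purely for illustration.
profiles = run(
    'data.csv',                    # input_path: CSV file to profile
    datetime='date',               # column holding per-record timestamps
    limit=10000,                   # only profile the first 10,000 rows
    output_prefix='out/profile',   # output files share this prefix
    infer_dtypes=True,             # convert values instead of keeping raw strings
)
# run() returns a dict mapping formatted date strings to DatasetProfile objects
for dt_str, profile in profiles.items():
    print(dt_str, len(profile.columns))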
Example #10
def _write_protobuf(path: str, profile: DatasetProfile):
    protobuf: Message = profile.to_protobuf()
    with open(os.path.join(path, 'protobuf.bin'), 'wb') as f:
        f.write(protobuf.SerializeToString())
Example #11
def _write_flat(path: str, profile: DatasetProfile):
    summary = profile.to_summary()
    flat_summary: dict = datasetprofile.flatten_summary(summary)
    # TODO: use absolute path when writing out data
    os.chdir(path)
    datasetprofile.write_flat_dataset_summary(flat_summary, 'summary')
Example #12
def _write_json(path: str, profile: DatasetProfile):
    summary = profile.to_summary()
    with open(os.path.join(path, 'whylogs.json'), 'wt') as f:
        f.write(message_to_json(summary))
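Taken together, Examples #10 through #12 write a single profile to disk as a protobuf message, a flat summary, and a JSON summary. A minimal driver sketch, assuming DatasetProfile and the three helpers above are importable; the profile contents and the output directory name are made up for illustration.

import datetime
import os

# Hypothetical driver for the _write_* helpers shown above.
profile = DatasetProfile(name='example', session_timestamp=datetime.datetime.utcnow())
profile.track('col1', 'value')

out_dir = 'whylogs-output'
os.makedirs(out_dir, exist_ok=True)

_write_protobuf(out_dir, profile)   # protobuf.bin
_write_json(out_dir, profile)       # whylogs.json
# _write_flat chdirs into the target directory, so call it last to avoid
# breaking the relative paths used by the other writers.
_write_flat(out_dir, profile)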