def test_record_drop_field():
    """A DropConfig on one field removes it; the wildcard path keeps the rest."""
    source = {'foo': 'bar', 'drop_me': 'bye'}
    paths = [
        DataPath(input='drop_me', xforms=DropConfig()),
        DataPath(input='*'),
    ]
    pipe = DataTransformPipeline(paths)
    assert pipe.transform_record(source) == {'foo': 'bar'}
def test_pipe_record_fpe(record_and_meta_2):
    """FPE-encrypt the latitude entity, restore it, then encrypt again.

    Encryption must be deterministic: transforming the restored record a
    second time yields exactly the same ciphertext value.
    """
    fpe = FpeFloatConfig(secret='2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94',
                         radix=10, labels=['latitude'])
    paths = [DataPath(input='latitude', xforms=fpe), DataPath(input='*')]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)

    def expected(latitude):
        # Every field except latitude is untouched by the pipeline.
        return {'summary': 'Alex Watson <*****@*****.**> works at Gretel. Alexander Ehrath '
                           'used to work at Qualcomm.',
                'dni': 'He loves 8.8.8.8 for DNS',
                'city': 'San Diego',
                'state': 'California', 'stuff': 'nothing labeled here',
                'latitude': latitude}

    out = forward.transform_record(record_and_meta_2)
    assert out['record'] == expected(124.10051071657566)

    out = backward.transform_record(out)
    assert out['record'] == expected(112.221)

    out = forward.transform_record(out)
    assert out['record'] == expected(124.10051071657566)
Ejemplo n.º 3
0
def test_bucketing():
    """Average-labelled buckets over [0, 1) with numeric outlier labels."""
    creation = BucketCreationParams(0.0, 1.0, 0.5)
    bucket_list = bucket_creation_params_to_list(creation, label_method="avg")
    pipe = DataTransformPipeline([
        DataPath(
            input="foo",
            xforms=BucketConfig(buckets=bucket_list,
                                lower_outlier_label=0.0,
                                upper_outlier_label=1.0),
        )
    ])
    records = [{"foo": "bar"}, {"foo": -1}, {"foo": 0.1}, {"foo": 0.9}, {"foo": 1.1}]
    transformed = [pipe.transform_record(item) for item in records]
    # Non-numeric values pass through untouched; outliers take the outlier
    # labels; in-range values take their bucket's average label.
    assert transformed == [
        {"foo": "bar"},
        {"foo": 0.0},
        {"foo": 0.25},
        {"foo": 0.75},
        {"foo": 1.0},
    ]
def test_filter_by_score(record_and_meta_2):
    """minimum_score gates which detected entities a transform is applied to."""
    entity_transforms = [
        # Replace names with PERSON_NAME. Should be applied to all.
        RedactWithLabelConfig(labels=['person_name'], minimum_score=Score.HIGH),
        # Replace names with XXXX. Should be applied to Qualcomm but not Gretel.
        RedactWithCharConfig(labels=['company_name'], minimum_score=Score.HIGH),
        # Replace names with LOCATION_CITY. Should be applied to San Diego.
        RedactWithLabelConfig(labels=['location_city']),
    ]
    # Every field gets the same transform list; fields without matching
    # entities are no-ops.
    field_names = ['summary', 'dni', 'city', 'state', 'stuff', 'latitude']
    pipeline = DataTransformPipeline(
        [DataPath(input=name, xforms=entity_transforms) for name in field_names])
    result = pipeline.transform_record(record_and_meta_2).get('record')
    assert result == {
        'summary': 'PERSON_NAME <*****@*****.**> works at Gretel. PERSON_NAME used to work at '
                   'XXXXXXXX.',
        'dni': 'He loves 8.8.8.8 for DNS',
        'city': 'LOCATION_CITY',
        'state': 'California',
        'stuff': 'nothing labeled here',
        'latitude': 112.221
    }
def test_record_fpe_base62():
    """Round-trip FPE over mixed field types, including radix-62 free text."""
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    source = {'latitude': -70.783, 'longitude': -112.221, 'credit_card': '4123567891234567', 'the_dude': 100000000,
              'the_hotness': "This is some awesome text with UPPER and lower case characters.",
              "the_sci_notation": 1.23E-7}
    digits_xf = [FpeStringConfig(secret=secret, radix=10)]
    floats_xf = [FpeFloatConfig(secret=secret, radix=10, float_precision=3)]
    # Strip whitespace before encrypting the card number.
    card_xf = [FormatConfig(pattern=r'\s+', replacement=''),
               FpeStringConfig(secret=secret, radix=10)]
    # Radix 62 covers upper-case, lower-case and digits.
    text_xf = [FpeStringConfig(secret=secret, radix=62)]

    paths = [
        DataPath(input='credit_card', xforms=card_xf),
        DataPath(input='longitude', xforms=floats_xf),
        DataPath(input='latitude', xforms=floats_xf),
        DataPath(input='the_dude', xforms=digits_xf),
        DataPath(input='the_sci_notation', xforms=floats_xf),
        DataPath(input='the_hotness', xforms=text_xf),
    ]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    encrypted = forward.transform_record(source)
    assert encrypted.get('credit_card') == '5931468769662449'
    assert backward.transform_record(encrypted) == source
def test_pipe_record_filter(record_meta_data_check):
    """Glob field patterns ('?' single char, '*' suffix) route entity transforms."""
    entity_transforms = [
        RedactWithLabelConfig(labels=['date']),
        SecureHashConfig(secret='rockybalboa', labels=['location']),
        FpeStringConfig(labels=['credit_card_number'],
                        secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94",
                        radix=10),
    ]
    paths = [DataPath(input=pattern, xforms=entity_transforms)
             for pattern in ('Country', '?ddress', 'Cr*')]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    out = forward.transform_record(record_meta_data_check)
    assert out['record']['Credit Card'] == '4471585942734458'
    # NER metadata tracks the transformed value and the new entity span.
    cc_labels = out['metadata']['fields']['Credit Card']['ner']['labels']
    assert cc_labels[0]['text'] == '4471585942734458'
    country_labels = out['metadata']['fields']['Country']['ner']['labels']
    assert country_labels[0]['start'] == 0
    assert country_labels[0]['end'] == 64
    # The metadata has one entry less than record entries, because Address does not have meta data in this test.
    assert len(out['metadata']['fields']) == 2
    assert len(out['record']) == 3
    restored = backward.transform_record(out)
    assert restored['record']['Credit Card'] == record_meta_data_check['record']['Credit Card']
Ejemplo n.º 7
0
def test_type_error():
    """A non-numeric value fed into a bucket transform is returned unchanged."""
    creation = BucketCreationParams(0.0, 1.0, 0.5)
    bucket_list = bucket_creation_params_to_list(creation)
    pipe = DataTransformPipeline(
        [DataPath(input="foo", xforms=BucketConfig(buckets=bucket_list))])
    record = {"foo": "bar"}
    # String throws a TypeError.  We catch it and return original record.
    assert pipe.transform_record(record) == record
def test_combine(record_and_meta_2):
    """CombineConfig joins the referenced fields onto the input field's value."""
    combine_xf = CombineConfig(combine=FieldRef(['latitude', 'city', 'state']),
                               separator=", ")
    pipe = DataTransformPipeline(
        [DataPath(input='dni', xforms=combine_xf, output='everything')])
    result = pipe.transform_record(record_and_meta_2)
    assert result['record'] == {'everything': 'He loves 8.8.8.8 for DNS, 112.221, San Diego, California'}
def test_pipe_bucket(records_date_tweak):
    """Alphabetic buckets map last names into 'A-L' / 'M-Z' ranges."""
    name_buckets = BucketConfig(
        buckets=[Bucket('A', 'L', 'A-L'), Bucket('M', 'Z', 'M-Z')])
    pipe = DataTransformPipeline([
        DataPath(input='last_name', xforms=name_buckets),
        DataPath(input='*'),
    ])
    watson = pipe.transform_record(records_date_tweak[0])
    ehrath = pipe.transform_record(records_date_tweak[1])
    assert watson['last_name'] == 'M-Z'
    assert ehrath['last_name'] == 'A-L'
def test_pipe_combine(records_date_tweak):
    """Combining onto last_name produces a single renamed output field."""
    combine_xf = CombineConfig(combine=FieldRef(['first_name', 'city', 'state']),
                               separator=", ")
    pipe = DataTransformPipeline(
        [DataPath(input='last_name', xforms=combine_xf, output='name_location')])
    assert pipe.transform_record(records_date_tweak[0]) == {
        'name_location': 'Watson, Alex, San Diego, California'}
    assert pipe.transform_record(records_date_tweak[1]) == {
        'name_location': 'Ehrath, Alex, San Marcos, California'}
def test_fpe_dirty_transform(record_dirty_fpe_check):
    """FPE leaves out-of-alphabet characters in place and still round-trips."""
    fpe_xf = FpeStringConfig(
        secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94",
        radix=10)
    paths = [
        DataPath(input='Credit Card', xforms=fpe_xf),
        DataPath(input='Customer ID', xforms=fpe_xf),
        DataPath(input='*'),
    ]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    encrypted = forward.transform_record(record_dirty_fpe_check)
    # Spaces and punctuation keep their positions; only digits are encrypted.
    assert encrypted['Credit Card'] == '447158 5942734 458'
    assert encrypted['Customer ID'] == '747/52*232 83-19'
    assert backward.transform_record(encrypted) == record_dirty_fpe_check
def test_redact_with_string(record_and_meta_2):
    """Field-level and entity-level string redaction coexist in one pipeline."""
    field_redactor = RedactWithStringConfig(string="DON'T_SHOW_THIS_FIELD")
    entity_redactor = RedactWithStringConfig(labels=['ip_address'],
                                             string="DON'T_SHOW_THIS_ENTITY")
    pipe = DataTransformPipeline([
        DataPath(input='city', xforms=field_redactor),
        DataPath(input='*', xforms=entity_redactor),
    ])
    result = pipe.transform_record(record_and_meta_2)
    assert result['record']['dni'] == 'He loves DON\'T_SHOW_THIS_ENTITY for DNS'
    assert result['record']['city'] == "DON'T_SHOW_THIS_FIELD"
def test_metadata_in_xf(record_meta_data_check):
    """Verify the record's Gretel metadata is forwarded to the field transform.

    Wraps the private ``_transform_field`` hook (``wraps=`` keeps the real
    implementation running) so every call can be inspected afterwards.
    """
    path = DataPath(input="*", xforms=[RedactWithLabelConfig()])

    with patch.object(
        path.transformations[0],
        "_transform_field",
        wraps=path.transformations[0]._transform_field,
    ) as xf_fn:
        xf = DataTransformPipeline([path])
        xf.transform_record(record_meta_data_check)
        # call_args_list[0][0][2]: third positional argument of the first
        # call — the metadata dict passed alongside the field value.
        assert (
            xf_fn.call_args_list[0][0][2]["gretel_id"]
            == record_meta_data_check["metadata"]["gretel_id"]
        )
def test_date_shift_format():
    """DateShiftConfig with an explicit date_format shifts and restores dates.

    Restored values always come back in ``date_format``, even when the
    original input used ISO-style dates.
    """
    shift_xf = DateShiftConfig(
        secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94",
        lower_range_days=-10,
        upper_range_days=25,
        date_format='%m/%d/%Y',
        tweak=FieldRef("user_id"),
    )
    paths = [DataPath(input="birthday", xforms=shift_xf), DataPath(input="*")]
    pipe = DataTransformPipeline(paths)
    restore_pipe = DataRestorePipeline(paths)

    def build(*birthdays):
        # All records share the same user_id, so the shift tweak is identical.
        return [{"user_id": "*****@*****.**", "birthday": day} for day in birthdays]

    shifted_expected = build("02/13/1963", "06/05/1961", "08/25/1958")
    restored_expected = build("02/17/1963", "06/09/1961", "08/29/1958")

    shifted = [pipe.transform_record(rec)
               for rec in build("02/17/1963", "06/09/1961", "08/29/1958")]
    assert shifted == shifted_expected
    assert [restore_pipe.transform_record(rec) for rec in shifted] == restored_expected

    # The same dates supplied in ISO format shift to the same targets ...
    shifted = [pipe.transform_record(rec)
               for rec in build("1963-02-17", "1961-06-09", "1958-08-29")]
    assert shifted == shifted_expected
    # ... and restore in date_format — please note the format!
    assert [restore_pipe.transform_record(rec) for rec in shifted] == restored_expected
def test_meta_data_transform(record_meta_data_check):
    """Entity transforms on a wildcard path update both values and NER metadata."""
    entity_transforms = [
        RedactWithLabelConfig(labels=['date']),
        SecureHashConfig(secret='rockybalboa', labels=['location']),
        FpeStringConfig(labels=['credit_card_number'],
                        secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94",
                        radix=10),
    ]
    paths = [DataPath(input='*', xforms=entity_transforms)]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    out = forward.transform_record(record_meta_data_check)
    assert out['record']['Credit Card'] == '4471585942734458'
    # Label text follows the transformed value; offsets follow the new span.
    cc_labels = out['metadata']['fields']['Credit Card']['ner']['labels']
    assert cc_labels[0]['text'] == '4471585942734458'
    country_labels = out['metadata']['fields']['Country']['ner']['labels']
    assert country_labels[0]['start'] == 0
    assert country_labels[0]['end'] == 64
    restored = backward.transform_record(out)
    assert restored['record']['Credit Card'] == record_meta_data_check['record']['Credit Card']
def test_gretel_meta(record_and_meta_2):
    """gretel_id is preserved through both the transform and restore passes."""
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    field_fpe = FpeStringConfig(secret=secret, radix=10)
    entity_fpe = FpeStringConfig(labels=['ip_address'], secret=secret, radix=10)

    paths = [
        DataPath(input='latitude', xforms=field_fpe),
        DataPath(input='*', xforms=entity_fpe),
    ]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)

    out = forward.transform_record(record_and_meta_2)
    assert out['metadata']['gretel_id'] == '2732c7ed44a8402f899a01e52a931985'
    out = backward.transform_record(out)
    assert out['record'] == record_and_meta_2['record']
    assert out['metadata']['gretel_id'] == '2732c7ed44a8402f899a01e52a931985'
Ejemplo n.º 17
0
def test_bucket(safecast_test_bucket2):
    """Temperature readings fall into the Low/Med/High buckets."""
    temp_buckets = BucketConfig(buckets=[
        Bucket(20.0, 23.0, "Low"),
        Bucket(23.0, 24.0, "Med"),
        Bucket(24.0, 25.0, "High"),
    ])
    pipe = DataTransformPipeline([
        DataPath(input="payload.env_temp", xforms=temp_buckets),
        DataPath(input="*"),
    ])
    results = [dict(pipe.transform_record(item.get("data")))
               for item in safecast_test_bucket2.get("data", {}).get("records")]
    assert results[0]["payload.env_temp"] == "Low"
    assert results[4]["payload.env_temp"] == "Med"
    assert results[7]["payload.env_temp"] == "High"
def test_record_fpe_precision():
    """float_precision controls how many leading digits FPE leaves intact."""
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    source = {'latitude': -70.783, 'longitude': -112.221, 'credit_card': '4123567891234567', 'the_dude': 100000000,
              'the_hotness': "convertme", "the_sci_notation": 1.23E-7}

    digits_xf = FpeStringConfig(secret=secret, radix=10)
    one_digit_xf = FpeFloatConfig(secret=secret, radix=10, float_precision=1)
    zero_digit_xf = FpeFloatConfig(secret=secret, radix=10, float_precision=0)
    text_xf = FpeStringConfig(secret=secret, radix=36)

    paths = [
        DataPath(input='credit_card', xforms=digits_xf),
        DataPath(input='latitude', xforms=one_digit_xf),
        DataPath(input='the_dude', xforms=digits_xf),
        DataPath(input='longitude', xforms=zero_digit_xf),
        DataPath(input='the_sci_notation', xforms=one_digit_xf),
        DataPath(input='the_hotness', xforms=text_xf),
        DataPath(input='*'),
    ]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    encrypted = forward.transform_record(source)
    assert encrypted.get('credit_card') == '5931468769662449'
    assert encrypted.get('longitude') == -112.2929577756414
    assert encrypted.get('latitude') == -70.78143312456855
    assert encrypted.get('the_hotness') == '2qjuxg7ju'
    assert encrypted.get('the_dude') == 128994144
    assert encrypted.get('the_sci_notation') == 1.2342967235924508e-07
    assert backward.transform_record(encrypted) == source
def test_record_fpe_mask():
    """A StringMask keeps the first character clear while FPE encrypts the rest."""
    source = {'latitude': -70.783, 'longitude': -112.221, 'credit_card': '4123 5678 9123 4567', 'the_dude': 100000000,
              'the_hotness': "convertme", "the_sci_notation": 1.23E-7}
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    keep_first_char = StringMask(start_pos=1)

    def round_trip(transforms):
        # Run the credit_card field forward and back; return both values.
        paths = [DataPath(input='credit_card', xforms=transforms)]
        forward = DataTransformPipeline(paths)
        backward = DataRestorePipeline(paths)
        encrypted_rec = forward.transform_record(source)
        restored_rec = backward.transform_record(encrypted_rec)
        return encrypted_rec.get('credit_card'), restored_rec.get('credit_card')

    # With FormatConfig the whitespace is stripped before encryption.
    encrypted, restored = round_trip(
        [FormatConfig(pattern=r'\s+', replacement=''),
         FpeStringConfig(secret=secret, radix=10, mask=[keep_first_char])])
    assert encrypted == '4599631908097107'
    assert restored == '4123567891234567'

    # Without it, the spaces are preserved in place.
    encrypted, restored = round_trip(
        [FpeStringConfig(secret=secret, radix=10, mask=[keep_first_char])])
    assert encrypted == '4599 6319 0809 7107'
    assert restored == '4123 5678 9123 4567'
def test_record_zero_fpe():
    """FPE handles positive and negative zero and still round-trips exactly."""
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    source = {'latitude': 0.0, 'longitude': -0.0, 'credit_card': '4123567891234567', 'the_dude': 100000000,
              'the_hotness': "convertme", "the_sci_notation": 1.23E-7}
    digits_xf = [FpeStringConfig(secret=secret, radix=10)]
    floats_xf = [FpeFloatConfig(secret=secret, radix=10, float_precision=3)]
    text_xf = [FpeStringConfig(secret=secret, radix=36)]

    paths = [
        DataPath(input='credit_card', xforms=digits_xf),
        DataPath(input='latitude', xforms=floats_xf),
        DataPath(input='longitude', xforms=floats_xf),
        DataPath(input='the_dude', xforms=digits_xf),
        DataPath(input='the_sci_notation', xforms=floats_xf),
        DataPath(input='the_hotness', xforms=text_xf),
    ]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    encrypted = forward.transform_record(source)
    assert encrypted.get('credit_card') == '5931468769662449'
    # The sign of zero is preserved by the float encryption.
    assert encrypted.get('longitude') == -1.32547939979e-312
    assert encrypted.get('latitude') == 1.32547939979e-312
    assert encrypted.get('the_hotness') == '2qjuxg7ju'
    assert encrypted.get('the_dude') == 128994144
    assert encrypted.get('the_sci_notation') == 1.229570610794763e-07
    assert backward.transform_record(encrypted) == source
def test_record_output_map_and_schemas():
    """Output renaming works across every supported record schema key."""
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"
    rec = {'a': 1.23, 'b': 2.34, 'c': 3.45, 'd': 4.56, 'e': 5.67}
    rec2 = {'f': 1.23, 'b': 2.34, 'c': 3.45, 'd': 4.56, 'e': 5.67}
    # Exercise every schema wrapper plus the bare-record (None) case.
    payloads = [(rec, record_key) for record_key in RECORD_KEYS] + [(rec, None)]
    for payload, record_key in payloads:
        float_fpe = FpeFloatConfig(secret=secret, radix=10)
        paths = [
            DataPath(input='a', output='x'),
            DataPath(input='b', output='y'),
            DataPath(input='c', xforms=float_fpe, output='z'),
            DataPath(input='d', xforms=float_fpe),
            DataPath(input='e', xforms=float_fpe),
            DataPath(input='*'),
        ]
        forward = DataTransformPipeline(paths)
        backward = DataRestorePipeline(paths)
        xf_payload = forward.transform_record(payload)
        xf_record = xf_payload.get(record_key) or xf_payload
        # Rename-only fields keep their values; FPE fields are encrypted.
        assert xf_record.get('x') == 1.23
        assert xf_record.get('y') == 2.34
        assert xf_record.get('z') == 3.590038584114511
        assert xf_record.get('d') == 7.002521213914073
        assert xf_record.get('e') == 4.9570355284951875
        restored = backward.transform_record(xf_payload)
        restored = restored.get(record_key) or restored
        assert restored == rec

        # test multiple names mapping to the same output field
        string_fpe = FpeStringConfig(secret=secret, radix=10)
        paths = [
            DataPath(input='a', xforms=string_fpe, output='x'),
            DataPath(input='f', xforms=string_fpe, output='x'),
            DataPath(input='b', xforms=string_fpe, output='y'),
            DataPath(input='c', xforms=string_fpe, output='z'),
            DataPath(input='*'),
        ]
        forward = DataTransformPipeline(paths)
        out1 = forward.transform_record(rec)
        out2 = forward.transform_record(rec2)
        record1 = out1.get(record_key) or out1
        record2 = out2.get(record_key) or out2
        # 'a' and 'f' share the value 1.23, so both records converge on 'x'.
        assert record1 == record2
def stream_table_view(
    data: dict,
    xf: DataTransformPipeline = None,
    sleep: float = 0.0,
    title: str = None,
    title_color: str = "black",
    clear: bool = False,
):
    """
    Stream a table view into a Jupyter cell.

    Args:
        data: Gretel-format payload with "record" and "metadata" keys.
        xf: optional pipeline; when given, fields whose values changed (or
            appeared) are tagged "Transformed" instead of their NER labels.
        sleep: seconds to pause after rendering (forwarded to display_df).
        title: optional table title.
        title_color: color used for the title text.
        clear: clear the cell output before rendering.
    """
    # NOTE(review): `xf` and `title` default to None but are annotated
    # non-optional; they should be Optional[...] — left as-is because
    # `typing` may not be imported at file level.

    def ner_tags(field) -> str:
        # Comma-joined NER labels recorded for this field, "" when none.
        field_data = data["metadata"]["fields"].get(
            str(field), {"ner": {"labels": []}})
        return ", ".join(x["label"] for x in field_data["ner"]["labels"])

    if xf:
        transformed = xf.transform_record(data)
        df = pd.DataFrame.from_dict(transformed["record"],
                                    orient="index",
                                    columns=["field"])
        df["tags"] = ""
        for field, value in transformed["record"].items():
            # Unchanged fields surface their detected labels; anything the
            # pipeline altered or introduced is marked "Transformed".
            if field in data["record"] and value == data["record"][field]:
                df.at[field, "tags"] = ner_tags(field)
            else:
                df.at[field, "tags"] = "Transformed"
    else:
        # Gretel format record: no pipeline, just show labels for each field.
        df = pd.DataFrame.from_dict(data["record"],
                                    orient="index",
                                    columns=["field"])
        df["tags"] = ""
        for field in list(df.index):
            df.at[field, "tags"] = ner_tags(field)
    display_df(df, sleep, clear, title, title_color)
def test_pipe_date_shift_cbc_fast(records_date_tweak):
    """Date shifting with and without a user_id tweak, using CBC_FAST AES."""
    # run tests with user_id to tweak the de-identified date
    user_id_fpe = FpeStringConfig(
        secret="2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94",
        radix=10, aes_mode=crypto_aes.Mode.CBC_FAST)
    tweaked_shift = DateShiftConfig(
        secret='2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94',
        lower_range_days=-10, upper_range_days=25,
        tweak=FieldRef('user_id'))
    paths = [DataPath(input='user_id', xforms=user_id_fpe),
             DataPath(input='created', xforms=tweaked_shift),
             DataPath(input='*')]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    shifted_aw = forward.transform_record(records_date_tweak[0])
    shifted_ae = forward.transform_record(records_date_tweak[1])
    assert shifted_aw['created'] == '2016-06-18'
    assert shifted_ae['created'] == '2016-06-18'
    assert backward.transform_record(shifted_ae)['created'] == '2016-06-17'
    assert backward.transform_record(shifted_aw)['created'] == '2016-06-17'

    # run tests without tweaking the de-identified date
    plain_shift = DateShiftConfig(
        secret='2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94',
        lower_range_days=-10, upper_range_days=25)
    paths = [DataPath(input='created', xforms=plain_shift)]
    forward = DataTransformPipeline(paths)
    backward = DataRestorePipeline(paths)
    shifted_aw = forward.transform_record(records_date_tweak[0])
    shifted_ae = forward.transform_record(records_date_tweak[1])
    # Without a tweak both records shift to the same date.
    assert shifted_aw['created'] == '2016-06-13'
    assert shifted_ae['created'] == '2016-06-13'
    assert backward.transform_record(shifted_aw)['created'] == '2016-06-17'
    assert backward.transform_record(shifted_ae)['created'] == '2016-06-17'
"""
Data pipeline only forwarding 2 fields, one of which get's renamed.
"""
from gretel_client.transformers import DataPath, DataTransformPipeline


paths = [
    DataPath(input="trash", output="new_trash"),
    DataPath(input="foo")
]

pipe = DataTransformPipeline(paths)

rec = {
    "foo": "hello",
    "trash": "old fish",
    "trash_again": "bad milk"
}

# Time to take out the trash
out = pipe.transform_record(rec)

assert out == {
    "foo": "hello",
    "new_trash": "old fish"
}

print(out)
Ejemplo n.º 25
0
        "activity": "Wedding Crasher",
        "guest": "Bobby O'Shea",
        "location": "Baltimore"
    },
]

guest_xf = FakeConstantConfig(seed=SEED, fake_method="name")
location_xf = FakeConstantConfig(seed=SEED, fake_method="city")

paths = [
    DataPath(input="guest", xforms=[guest_xf]),
    DataPath(input="location", xforms=[location_xf]),
    DataPath(input="*"),
]

pipe = DataTransformPipeline(paths)

results = []

for record in SOURCE:
    results.append(pipe.transform_record(record))

assert results == [
    {
        "activity": "Wedding Crasher",
        "guest": "Sean Johnson",
        "location": "Smithtown"
    },
    {
        "activity": "Wedding Crasher",
        "guest": "Christopher Obrien",
    RedactWithCharConfig,
    DataPath,
    DataTransformPipeline,
    StringMask,
)

# Full-field character redaction: the default redaction char is "X",
# the second transform overrides it with "Y".
xf = [RedactWithCharConfig()]
xf2 = [RedactWithCharConfig(char="Y")]

paths = [
    DataPath(input="foo", xforms=xf),
    DataPath(input="bar", xforms=xf2),
    DataPath(input="*"),  # forward every other field untouched
]

pipe = DataTransformPipeline(paths)

rec = {"foo": "hello", "bar": "there", "baz": "world"}

out = pipe.transform_record(rec)

# "foo" and "bar" are fully masked; "baz" only matched the wildcard path.
assert out == {"foo": "XXXXX", "bar": "YYYYY", "baz": "world"}

print(out)

# Now let's do partial redactions

mask_1 = StringMask(
    start_pos=3)  # let's only keep the first few chars of an email address
mask_2 = StringMask(
    mask_after="@")  # let's only mask the domain part of the email address
Ejemplo n.º 27
0
def test_bucket2(safecast_test_bucket2):
    """Bucket labels and outlier labels over a stream of safecast readings."""

    def bucketed_temps(bucket_config):
        # Run every record through a payload.env_temp bucketing pipeline and
        # collect the resulting label (None when the field is absent).
        pipe = DataTransformPipeline([
            DataPath(input="payload.env_temp", xforms=bucket_config),
            DataPath(input="*"),
        ])
        return [pipe.transform_record(item.get("data")).get("payload.env_temp")
                for item in safecast_test_bucket2.get("data", {}).get("records")]

    # Narrow range with explicit outlier labels: readings below 22.0 become
    # "YEET", readings above 25.0 become "WOOT".
    narrow_config = [
        BucketConfig(buckets=[
            Bucket(22.0, 23.0, "FEET_0"),
            Bucket(23.0, 24.0, "FEET_1"),
            Bucket(24.0, 25.0, "FEET_2"),
        ],
            lower_outlier_label="YEET",
            upper_outlier_label="WOOT")
    ]
    assert bucketed_temps(narrow_config) == [
        "YEET",
        None,
        None,
        None,
        "FEET_1",
        None,
        None,
        "WOOT",
        None,
        None,
        None,
    ]

    # Wider range, no outlier labels: every observed temperature is covered.
    wide_config = BucketConfig(buckets=[
        Bucket(21.0, 22.0, "nice"),
        Bucket(22.0, 23.0, "bearable"),
        Bucket(23.0, 24.0, "toasty"),
        Bucket(24.0, 25.0, "volcano"),
        Bucket(25.0, 26.0, "nuke"),
    ])
    assert bucketed_temps(wide_config) == [
        "nice",
        None,
        None,
        None,
        "toasty",
        None,
        None,
        "nuke",
        None,
        None,
        None,
    ]
def test_record_xf(record_and_meta_2):
    """Entity transforms applied per-field, via wildcard, and with a drop.

    The same transform list is exercised three ways: one explicit DataPath
    per field, a wildcard path, and finally with a DropConfig prepended so
    any field containing an ip_address entity is removed entirely.
    """
    entity_xf_list = [
        RedactWithLabelConfig(labels=['person_name']),             # names -> PERSON_NAME
        FakeConstantConfig(labels=['email_address'], seed=SEED),   # consistent fake emails
        RedactWithCharConfig(labels=['ip_address']),               # char-redact IPs
        RedactWithCharConfig(char='N', labels=['location_city']),  # should not be run
        SecureHashConfig(secret='rockybalboa', labels=['location_state']),
        FakeConstantConfig(labels=['latitude'], seed=SEED),        # replace latitude
    ]
    # Field-level transform: redact the entire city value with 'Y'.
    city_redact = RedactWithCharConfig(char='Y')

    field_names = ('summary', 'dni', 'city', 'state', 'stuff', 'latitude')

    def per_field_paths():
        # One DataPath per field; 'city' additionally gets the whole-field
        # char redaction. Built lazily so entity_xf_list mutations (the
        # DropConfig insert below) are picked up.
        return [
            DataPath(input=name,
                     xforms=[entity_xf_list, city_redact] if name == 'city'
                     else entity_xf_list)
            for name in field_names
        ]

    check1 = DataTransformPipeline(
        per_field_paths()).transform_record(record_and_meta_2).get('record')

    wildcard_paths = [
        DataPath(input='city', xforms=[entity_xf_list, city_redact]),
        DataPath(input='*', xforms=entity_xf_list),
    ]
    check2 = DataTransformPipeline(
        wildcard_paths).transform_record(record_and_meta_2).get('record')

    # Per-field and wildcard routing must produce the same result.
    expected = {
        'summary': 'PERSON_NAME <*****@*****.**> works at Gretel. PERSON_NAME used to work at '
                   'Qualcomm.',
        'dni': 'He loves X.X.X.X for DNS',
        'city': 'YYY YYYYY',
        'state': '8896cd9f38ceac0e98f47c41a2028219f17d8ef41277e4e2138d52a08c24e0aa',
        'stuff': 'nothing labeled here',
        'latitude': -89.3146475}
    assert check1 == expected
    assert check2 == expected

    # Now add a drop transform that fires first: any field containing an
    # ip_address entity ('dni') disappears from the output record.
    entity_xf_list.insert(0, DropConfig(labels=['ip_address']))

    check = DataTransformPipeline(
        per_field_paths()).transform_record(record_and_meta_2).get('record')

    assert check == {
        'summary': 'PERSON_NAME <*****@*****.**> works at Gretel. PERSON_NAME used to work at '
                   'Qualcomm.',
        'city': 'YYY YYYYY',
        'state': '8896cd9f38ceac0e98f47c41a2028219f17d8ef41277e4e2138d52a08c24e0aa',
        'stuff': 'nothing labeled here',
        'latitude': -89.3146475}
Ejemplo n.º 29
0
# Build the bucket list from a (min, max, width) tuple defined earlier.
# NOTE: the ``label_method`` defaults to "min", but here we use "avg"
buckets = bucket_creation_params_to_list(min_max_width_tuple,
                                         label_method="avg")

# Scores below/above the covered range get these fixed outlier labels
# instead of a bucket label.
numeric_bucketing_xf = BucketConfig(
    buckets=buckets,
    lower_outlier_label=0.0,
    upper_outlier_label=3.5,
)

# Bucket only the "score" field; the wildcard path passes every other
# field through unchanged.
paths = [
    DataPath(input="score", xforms=numeric_bucketing_xf),
    DataPath(input="*")
]

pipe = DataTransformPipeline(paths)

records = [
    {
        "foo": "hello",
        "score": 1.234
    },
    {
        "score": 2.234
    },
    {
        "foo": "hello",
        "score": 3.234
    },
    {
        "score": -1.234
def test_conditional_transformer(records_conditional):
    """Restore lat/lon conditionally on the record's ``user_consent`` field.

    Three variants of ConditionalConfig are exercised on FPE-encrypted
    coordinates: both branches set, only ``true_xform``, and only
    ``false_xform``.
    """
    secret = "2B7E151628AED2A6ABF7158809CF4F3CEF4359D8D580AA4F7F036D6F04FC6A94"

    def lat_lon_paths(xform):
        # lon/lat get the supplied transform; everything else passes through.
        return [DataPath(input='lon', xforms=xform),
                DataPath(input='lat', xforms=xform),
                DataPath(input='*'),
                ]

    # --- Variant 1: decrypt when consent given, redact with label otherwise.
    xf_fpe = FpeFloatConfig(secret=secret, radix=10)
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   true_xform=xf_fpe,
                                   false_xform=RedactWithLabelConfig())

    xf_encrypt = DataTransformPipeline(lat_lon_paths(xf_fpe))
    xf_decrypt = DataRestorePipeline(lat_lon_paths(xf_consent))
    check_aw = xf_encrypt.transform_record(records_conditional[0])
    check_ae = xf_encrypt.transform_record(records_conditional[1])
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 77.00217823076872
    assert check_aw['record']['lon'] == 254.0404040486477
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    # No consent -> labels; consent -> original coordinates restored.
    assert check_ae['record']['lat'] == 'LATITUDE'
    assert check_ae['record']['lon'] == 'LONGITUDE'
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433

    # --- Variant 2: only true_xform; no-consent records stay encrypted.
    xf_fpe = FpeFloatConfig(secret=secret, radix=10)
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   true_xform=xf_fpe)

    xf_encrypt = DataTransformPipeline(lat_lon_paths(xf_fpe))
    xf_decrypt = DataRestorePipeline(lat_lon_paths(xf_consent))
    check_aw = xf_encrypt.transform_record(records_conditional[0])
    check_ae = xf_encrypt.transform_record(records_conditional[1])
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 77.00217823076872
    assert check_aw['record']['lon'] == 254.0404040486477
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    assert check_ae['record']['lat'] == 50.65564864394322
    assert check_ae['record']['lon'] == 191.8142181740291
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433

    # --- Variant 3: only false_xform; feeds the previous decrypt outputs
    # back in, so the still-encrypted no-consent record now gets decrypted.
    xf_fpe = FpeFloatConfig(secret=secret, radix=10)
    xf_consent = ConditionalConfig(conditional_value=FieldRef('user_consent'), regex=r"['1']",
                                   false_xform=xf_fpe)

    xf_decrypt = DataRestorePipeline(lat_lon_paths(xf_consent))
    check_aw = xf_decrypt.transform_record(check_aw)
    check_ae = xf_decrypt.transform_record(check_ae)
    assert check_ae['record']['lat'] == 35.659491
    assert check_ae['record']['lon'] == 139.72785
    assert check_aw['record']['lat'] == 112.22134
    assert check_aw['record']['lon'] == 135.76433