Example #1
def test_reversible_tokenizer():
    sess = utils.make_session("test.tokenizer.reversibleTokenizer")
    key = b"5" * 32
    plaintext = pd.DataFrame({"name": ["Alice", "Bob"]})

    tokenized = _apply_tokenizer(
        sess,
        plaintext,
        tkn.ReversibleTokenizer(key=key),
        col_to_rename="to_token(name)",
    )
    tokenized_expected = pd.DataFrame({
        "name": [
            "c8c7e80144304276183e5bcd589db782bc5ff95309",
            "e0f40aea0d5c21b35967c4231b98b5b3e5338e",
        ]
    })
    pdt.assert_frame_equal(tokenized, tokenized_expected)

    recovered = _apply_tokenizer(
        sess,
        tokenized,
        tkn.TokenReverser(key=key),
        col_to_rename="from_token(name)",
    )
    pdt.assert_frame_equal(recovered, plaintext)
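
Several of these examples call a shared helper, _apply_tokenizer, that is not shown in this listing. A minimal sketch of what such a helper might look like, assuming it wraps the pandas frame in a Spark DataFrame, applies the transformation to the "name" column, renames the generated column, and collects the result back to pandas (the body below is an assumption inferred from how the tests call it):

def _apply_tokenizer(sess, df, tokenizer, col_to_rename):
    # Hypothetical helper (assumption): wrap the pandas frame in a Spark
    # DataFrame, apply the transformation to the "name" column, rename the
    # generated column (e.g. "to_token(name)") back to "name", and collect.
    spark_df = sess.createDataFrame(df, schema=["name"])
    result_df = spark_df.select(tokenizer(spark_df.name))
    result_df = result_df.withColumnRenamed(col_to_rename, "name")
    return result_df.toPandas()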

def test_tokenizer_with_max_token_len():
    sess = utils.make_session("test.tokenizer.maxTokenLen")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]})
    max_token_len = 10
    key = "secret_key"
    df = _make_and_apply_tokenizer(sess, test_df, max_token_len=max_token_len, key=key)
    pdt.assert_frame_equal(df, expected)

def test_tokenizer_is_linkable():
    sess = utils.make_session("test.tokenizer.isLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "secret_key"
    df1 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key1)
    df2 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key2)
    pdt.assert_frame_equal(df1, df2)
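
test_tokenizer_with_max_token_len, test_tokenizer_is_linkable, and the later tokenizer tests all go through _make_and_apply_tokenizer, which is also not shown. A plausible sketch, assuming it simply constructs a tkn.Tokenizer and delegates to _apply_tokenizer (an assumption, not the project's actual helper):

def _make_and_apply_tokenizer(sess, df, max_token_len, key):
    # Hypothetical helper (assumption): build the non-reversible Tokenizer
    # and reuse _apply_tokenizer to run it over the "name" column.
    tokenizer = tkn.Tokenizer(max_token_len=max_token_len, key=key)
    return _apply_tokenizer(sess, df, tokenizer, col_to_rename="to_token(name)")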
Example #4
def test_truncate_date():
    sess = utils.make_session("test.truncation.date")
    test_df, expected = _make_date_data(sess)
    truncate = rnd.DateTruncation("month")
    result_df = test_df.select(truncate(test_df.data)).toPandas()
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_equal(result, expected)
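
_make_date_data is another helper that is not included. One possible shape, assuming it returns a one-column Spark DataFrame of dates together with the expected month-truncated values as a numpy object array (the names and sample dates below are illustrative assumptions; requires import datetime):

def _make_date_data(sess):
    # Hypothetical helper (assumption): input dates plus their expected values
    # after truncation to the first day of the month, shaped like .values.
    pdf = pd.DataFrame({"data": [datetime.date(2018, 10, 15), datetime.date(2016, 2, 29)]})
    expected = np.array(
        [datetime.date(2018, 10, 1), datetime.date(2016, 2, 1)], dtype=object
    ).reshape(-1, 1)
    return sess.createDataFrame(pdf, schema=["data"]), expected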
Example #5
def test_rounding_long():
    precision = -2
    sess = utils.make_session("test.rounding.integer")
    test_df, expected = _make_integer_data(np.int64, precision)
    result_df = _make_and_apply_rounder(sess, test_df, dtypes.Long, precision)
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_almost_equal(result, expected)
Example #6
def test_rounding_float():
    precision = 0
    sess = utils.make_session("test.rounding.float")
    test_df, expected = _make_float_data(np.float32, precision)
    result_df = _make_and_apply_rounder(sess, test_df, dtypes.Float, precision)
    result = result_df.values
    assert result.dtype == expected.dtype
    np.testing.assert_almost_equal(result, expected)
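
The rounding tests likewise depend on helpers that are not shown. A rough sketch under assumptions: _make_float_data (and analogously _make_integer_data) builds a one-column frame plus the numpy-rounded expectation, and _make_and_apply_rounder applies a rounding transformation over Spark; the class name rnd.NumericRounding and its signature below are assumptions:

def _make_float_data(dtype, precision):
    # Hypothetical helper (assumption): a single "data" column plus the values
    # rounded with numpy at the requested precision, shaped like df.values.
    values = np.array([10.8834, 4.21221], dtype=dtype)
    expected = np.around(values, decimals=precision).astype(dtype).reshape(-1, 1)
    return pd.DataFrame({"data": values}), expected

def _make_and_apply_rounder(sess, df, spark_dtype, precision):
    # Hypothetical helper (assumption): the rounder is assumed to take the
    # target Spark dtype and precision, and to be callable on a column.
    rounder = rnd.NumericRounding(spark_dtype, precision)
    spark_df = sess.createDataFrame(df, schema=["data"])
    return spark_df.select(rounder(spark_df.data)).toPandas()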
Example #7
def test_column_redact():
    sess = utils.make_session("test.redaction.column")
    df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"])
    expected = pd.DataFrame(np.ones((5, )), columns=["a"])
    test_df = sess.createDataFrame(df, schema=["a", "b", "c"])
    redact = rdc.ColumnRedact(["b", "c"])
    result = redact(test_df).toPandas()
    pdt.assert_frame_equal(result, expected)
Example #8
def test_tokenizer_no_key():
    sess = utils.make_session("test.tokenizer.maxTokenLen")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
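    # No explicit assertion: the test only checks that tokenizing without a key does not raise.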
    _apply_tokenizer(
        sess,
        test_df,
        tkn.Tokenizer(max_token_len=None, key=None),
        col_to_rename="to_token(name)",
    )
Example #9
def test_row_redact():
    sess = utils.make_session("test.redaction.row")
    df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"])
    df["a"].iloc[0] = 6
    df["a"].iloc[3] = 6
    expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"])
    test_df = sess.createDataFrame(df, schema=["a", "b"])
    redact = rdc.RowRedact("a > 5")
    result = redact(test_df).toPandas()
    pdt.assert_frame_equal(result, expected)

def test_tokenizer_is_not_linkable():
    sess = utils.make_session("test.tokenizer.isNotLinkable")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    key1 = "secret_key"
    key2 = "not_your_secret_key"
    df1 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key1)
    df2 = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key2)
    try:
        pdt.assert_frame_equal(df1, df2)
        raise NotImplementedError
    except AssertionError:
        pass
    except NotImplementedError:
        raise AssertionError("tokens produced with different keys should not match")

def test_tokenizer_simple():
    sess = utils.make_session("test.tokenizer.simple")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
    expected = pd.DataFrame(
        {
            "name": [
                "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24",
                "dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b",
            ]
        }
    )
    key = "secret_key"
    df = _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=key)
    pdt.assert_frame_equal(df, expected)

def test_tokenizer_no_key():
    sess = utils.make_session("test.tokenizer.maxTokenLen")
    test_df = pd.DataFrame({"name": ["Alice", "Bob"]})
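    # No explicit assertion: only verifies that omitting the key does not raise.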
    _make_and_apply_tokenizer(sess, test_df, max_token_len=None, key=None)