Beispiel #1
0
def test_make_histograms():
    """Compare unit-binned histograms of the test frame with stored references.

    Histograms are built from ``pytest.test_df`` for a mix of one- and
    two-dimensional features; latitude and longitude get an explicit bin
    width of 5 with offset 0.  Each histogram's ``toJson()`` output must
    equal the matching reference attribute on the ``pytest`` module.
    """
    requested = [
        'date', 'isActive', 'age', 'eyeColor', 'gender', 'company',
        'latitude', 'longitude',
        ['isActive', 'age'],
        ['latitude', 'longitude'],
    ]
    # A separate dict per coordinate, so the two specs never share state.
    coordinate_specs = {
        name: {'bin_width': 5, 'bin_offset': 0}
        for name in ('longitude', 'latitude')
    }

    built = make_histograms(
        pytest.test_df,
        features=requested,
        binning='unit',
        bin_specs=coordinate_specs,
    )

    # (histogram key, reference JSON) pairs, checked in this order.
    references = [
        ('age', pytest.age),
        ('company', pytest.company),
        ('date', pytest.date),
        ('eyeColor', pytest.eyesColor),
        ('gender', pytest.gender),
        ('isActive', pytest.isActive),
        ('isActive:age', pytest.isActive_age),
        ('latitude', pytest.latitude),
        ('longitude', pytest.longitude),
        ('latitude:longitude', pytest.latitude_longitude),
    ]
    for key, reference in references:
        assert built[key].toJson() == reference
def test_make_histograms_with_time_axis():
    """Check make_histograms() with an automatically chosen time axis.

    With ``time_axis=True`` and ``ret_specs=True`` the call returns the
    histograms plus features, bin specifications, the selected time axis and
    the variable dtypes.  The test verifies their sizes, that every feature is
    a two-part ``date:<column>`` key, and that get_bin_specs() reproduces the
    ``date:age`` binning in each of its four call forms.
    """
    date_width = 751582381944448.0
    age_width = 2.0
    age_offset = 9.5

    def check_age_axis(spec):
        # Binning expected on the 'age' axis.
        assert spec["bin_width"] == age_width
        assert spec["bin_offset"] == age_offset

    def check_date_age(pair):
        # Two-axis spec: index 0 is the 'date' axis, index 1 the 'age' axis.
        assert pair[0]["bin_width"] == date_width
        check_age_axis(pair[1])

    outputs = make_histograms(pytest.test_df, time_axis=True, ret_specs=True)
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21
    assert time_axis == "date"

    assert "date:age" in hists
    assert hists["date:age"].binWidth == date_width

    for feature in features:
        parts = feature.split(":")
        assert len(parts) == 2 and parts[0] == "date"
    for spec in bin_specs.values():
        assert len(spec) == 2

    assert "date:age" in bin_specs
    check_date_age(bin_specs["date:age"])

    # get_bin_specs on the full collection
    collected = get_bin_specs(hists)
    assert "date:age" in collected
    check_date_age(collected["date:age"])

    # get_bin_specs on the full collection, first axis skipped
    collected = get_bin_specs(hists, skip_first_axis=True)
    assert "age" in collected
    check_age_axis(collected["age"])

    # get_bin_specs on a single histogram
    check_date_age(get_bin_specs(hists["date:age"]))

    # get_bin_specs on a single histogram, first axis skipped
    check_age_axis(get_bin_specs(hists["date:age"], skip_first_axis=True))
Beispiel #3
0
def test_make_histograms_with_time_axis():
    """Verify histograms and bin specs produced with ``time_axis=True``.

    The call is expected to select 'date' as the time axis and return 20
    two-dimensional ``date:<column>`` histograms together with their bin
    specifications.  The ``date:age`` binning is then re-derived through
    get_bin_specs() for a dict of histograms and for a single histogram, with
    and without skipping the first axis.
    """
    expected_date_width = 751582381944448.0
    expected_age = {'bin_width': 2.0, 'bin_offset': 9.5}

    def age_matches(spec):
        # True when spec carries the expected width and offset for 'age'.
        return all(spec[key] == value for key, value in expected_age.items())

    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        time_axis=True,
        ret_specs=True,
    )

    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21
    assert time_axis == 'date'
    assert 'date:age' in hists
    assert hists['date:age'].binWidth == expected_date_width

    split_features = [name.split(':') for name in features]
    assert all(len(parts) == 2 and parts[0] == 'date' for parts in split_features)
    assert all(len(spec) == 2 for spec in bin_specs.values())

    assert 'date:age' in bin_specs
    date_spec, age_spec = bin_specs['date:age'][0], bin_specs['date:age'][1]
    assert date_spec['bin_width'] == expected_date_width
    assert age_matches(age_spec)

    # test get_bin_specs 1: all histograms, both axes
    from_all = get_bin_specs(hists)
    assert 'date:age' in from_all
    assert from_all['date:age'][0]['bin_width'] == expected_date_width
    assert age_matches(from_all['date:age'][1])

    # test get_bin_specs 2: all histograms, first axis skipped
    from_all_skipped = get_bin_specs(hists, skip_first_axis=True)
    assert 'age' in from_all_skipped
    assert age_matches(from_all_skipped['age'])

    # test get_bin_specs 3: one histogram, both axes
    from_one = get_bin_specs(hists['date:age'])
    assert from_one[0]['bin_width'] == expected_date_width
    assert age_matches(from_one[1])

    # test get_bin_specs 4: one histogram, first axis skipped
    from_one_skipped = get_bin_specs(hists['date:age'], skip_first_axis=True)
    assert age_matches(from_one_skipped)
def test_make_histograms():
    """Compare unit-binned histograms, including 'transaction', with references.

    Histograms are built from ``pytest.test_df``.  'transaction' is given a
    num/low/high specification, latitude and longitude a bin width of 5 with
    offset 0.  Every resulting histogram's ``toJson()`` output must equal the
    reference stored under the corresponding attribute of the ``pytest``
    module.
    """
    single_features = [
        "date", "isActive", "age", "eyeColor", "gender", "company",
        "latitude", "longitude",
    ]
    paired_features = [["isActive", "age"], ["latitude", "longitude"]]
    requested = single_features + paired_features + ["transaction"]

    specs = {"transaction": {"num": 100, "low": -2000, "high": 2000}}
    for coordinate in ("longitude", "latitude"):
        specs[coordinate] = {"bin_width": 5, "bin_offset": 0}

    built = make_histograms(
        pytest.test_df,
        features=requested,
        binning="unit",
        bin_specs=specs,
    )

    # Histogram key -> reference JSON; checked in insertion order.
    references = {
        "age": pytest.age,
        "company": pytest.company,
        "date": pytest.date,
        "eyeColor": pytest.eyesColor,
        "gender": pytest.gender,
        "isActive": pytest.isActive,
        "isActive:age": pytest.isActive_age,
        "latitude": pytest.latitude,
        "longitude": pytest.longitude,
        "latitude:longitude": pytest.latitude_longitude,
        "transaction": pytest.transaction,
    }
    for key, reference in references.items():
        assert built[key].toJson() == reference
Beispiel #5
0
def test_spark_make_histograms(spark_context):
    """Check make_histograms() on a Spark dataframe against stored references.

    The references on the ``pytest`` module carry plain names such as
    ``"age"``, while this test expects names of the form ``"b'age'"`` from the
    Spark path.  The names are therefore patched on deep copies of the
    references, so the shared objects on the ``pytest`` module stay untouched
    and other tests in the same session keep seeing the original names.

    Args:
        spark_context: fixture used here only to turn ``pytest.test_df`` into
            a Spark dataframe via ``createDataFrame``.
    """
    import copy

    # Histogram key -> shared reference JSON for every histogram asserted below.
    # NOTE(review): earlier revisions also patched the 'isActive' and
    # 'latitude:longitude' references but never asserted on them; they are
    # left out here rather than guessing at the expected Spark output.
    shared_references = {
        "age": pytest.age,
        "company": pytest.company,
        "eyeColor": pytest.eyesColor,
        "gender": pytest.gender,
        "latitude": pytest.latitude,
        "longitude": pytest.longitude,
        "transaction": pytest.transaction,
    }

    expected = {}
    for feature, reference in shared_references.items():
        patched = copy.deepcopy(reference)
        patched["data"]["name"] = "b'{}'".format(feature)
        expected[feature] = patched

    spark_df = spark_context.createDataFrame(pytest.test_df)

    # test make_histograms() function call with spark df
    current_hists = make_histograms(
        spark_df,
        features=[
            "date",
            "isActive",
            "age",
            "eyeColor",
            "gender",
            "company",
            "latitude",
            "longitude",
            ["isActive", "age"],
            ["latitude", "longitude"],
            "transaction",
        ],
        bin_specs={
            "transaction": {"num": 100, "low": -2000, "high": 2000},
            "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
            "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
        },
        binning="unit",
    )

    for feature, reference in expected.items():
        assert current_hists[feature].toJson() == reference
def test_make_histograms_unit_binning():
    """Check make_histograms() with unit binning and no time axis.

    With ``binning="unit"`` and ``time_axis=""`` the call is expected to
    return 21 one-dimensional histograms and an empty bin-spec mapping.  The
    'date' histogram must have a bin width of 2592000000000000 and the 'age'
    histogram a bin width of 1.0 with origin 0.0.
    """
    outputs = make_histograms(
        pytest.test_df, binning="unit", time_axis="", ret_specs=True
    )
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 0
    assert len(var_dtype) == 21
    assert time_axis == ""

    assert "date" in hists
    assert hists["date"].binWidth == 2592000000000000

    # No feature may be a colon-joined multi-column key.
    assert all(len(name.split(":")) == 1 for name in features)
    assert all(isinstance(spec, dict) for spec in bin_specs.values())

    assert "age" in hists
    age_hist = hists["age"]
    assert age_hist.binWidth == 1.0
    assert age_hist.origin == 0.0
def test_make_histograms_no_time_axis():
    """Check make_histograms() with default binning and no time axis.

    With ``time_axis=""`` the call is expected to return 21 one-dimensional
    histograms and six bin specifications, each a dict.  The 'date' histogram
    and the 'age' bin specification are checked against fixed values.
    """
    outputs = make_histograms(pytest.test_df, time_axis="", ret_specs=True)
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 6
    assert len(var_dtype) == 21
    assert time_axis == ""

    assert "date" in hists
    assert hists["date"].binWidth == 751582381944448.0

    # Every feature is a single column, and every spec a plain dict.
    assert all(len(name.split(":")) == 1 for name in features)
    assert all(isinstance(spec, dict) for spec in bin_specs.values())

    assert "age" in bin_specs
    age_spec = bin_specs["age"]
    assert age_spec["bin_width"] == 2.0
    assert age_spec["bin_offset"] == 9.5
Beispiel #8
0
def test_make_histograms_no_time_axis():
    """Verify make_histograms() output when the time axis is disabled.

    Calls make_histograms() on ``pytest.test_df`` with ``time_axis=''`` and
    ``ret_specs=True``, then checks the sizes of the returned collections, the
    bin width of the 'date' histogram, that features are single-column names,
    and the bin specification derived for 'age'.
    """
    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        time_axis='',
        ret_specs=True,
    )

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 6
    assert len(var_dtype) == 21
    assert time_axis == ''

    assert 'date' in hists
    date_hist = hists['date']
    assert date_hist.binWidth == 751582381944448.0

    for name in features:
        assert len(name.split(':')) == 1
    for spec in bin_specs.values():
        assert isinstance(spec, dict)

    assert 'age' in bin_specs
    age_spec = bin_specs['age']
    assert age_spec['bin_width'] == 2.0
    assert age_spec['bin_offset'] == 9.5
Beispiel #9
0
def test_make_histograms_unit_binning():
    """Verify make_histograms() output for unit binning without a time axis.

    Calls make_histograms() on ``pytest.test_df`` with ``binning='unit'``,
    ``time_axis=''`` and ``ret_specs=True``.  The returned bin-spec mapping
    must be empty, all 21 features must be single-column names, and the
    'date' and 'age' histograms must have the expected bin width (and, for
    'age', origin).
    """
    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        binning='unit',
        time_axis='',
        ret_specs=True,
    )

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 0
    assert len(var_dtype) == 21
    assert time_axis == ''

    assert 'date' in hists
    date_hist = hists['date']
    assert date_hist.binWidth == 2592000000000000

    for name in features:
        assert len(name.split(':')) == 1
    for spec in bin_specs.values():
        assert isinstance(spec, dict)

    assert 'age' in hists
    age_hist = hists['age']
    assert age_hist.binWidth == 1.0
    assert age_hist.origin == 0.0
def test_histogram_stitching():
    """Exercise stitch_histograms() with and without a time axis.

    Two histogram sets are built from ``pytest.test_df``: one keyed by
    ``date:<column>`` and one keyed by plain column names, the latter reusing
    the non-date bin specifications of the former.  They are then stitched in
    several combinations (explicit, single and automatic time-bin indices;
    default and ``mode="replace"``), and the resulting keys and entry counts
    are checked.
    """
    dated_keys = sorted(["date:isActive", "date:eyeColor", "date:latitude"])
    plain_keys = sorted(["isActive", "eyeColor", "latitude", "age"])
    dated_keys_with_age = sorted(dated_keys + ["date:age"])

    def check_keys(hists, expected):
        # The sorted histogram keys must equal the expected list.
        np.testing.assert_array_equal(sorted(hists.keys()), expected)

    dated = make_histograms(pytest.test_df, features=dated_keys)
    shared_specs = get_bin_specs(dated, skip_first_axis=True)
    plain = make_histograms(
        pytest.test_df, features=plain_keys, bin_specs=shared_specs
    )

    # plain histograms placed at time bin 50 and stitched onto the dated ones
    stitched = stitch_histograms(
        hists_basis=dated, hists_delta=plain, time_axis="date", time_bin_idx=[50]
    )
    check_keys(stitched, dated_keys)
    assert stitched["date:isActive"].entries == 800
    assert stitched["date:isActive"].bins[50].entries == 400

    # plain histograms stitched with themselves at time bins 50 and 51
    stitched = stitch_histograms(
        hists_basis=plain, hists_delta=plain, time_axis="date", time_bin_idx=[50, 51]
    )
    check_keys(stitched, dated_keys_with_age)
    date_age = stitched["date:age"]
    assert date_age.entries == 800
    assert date_age.bins[50].entries == 400
    assert date_age.bins[51].entries == 400

    # plain histograms stitched with themselves, no time_bin_idx given:
    # bins 0 and 1 are expected
    stitched = stitch_histograms(hists_basis=plain, hists_delta=plain, time_axis="date")
    check_keys(stitched, dated_keys_with_age)
    date_age = stitched["date:age"]
    assert date_age.entries == 800
    assert 0 in date_age.bins
    assert 1 in date_age.bins

    # plain histograms stitched with themselves, scalar time_bin_idx=50:
    # bin 51 is expected to be present
    stitched = stitch_histograms(
        hists_basis=plain, hists_delta=plain, time_axis="date", time_bin_idx=50
    )
    check_keys(stitched, dated_keys_with_age)
    date_age = stitched["date:age"]
    assert date_age.entries == 800
    assert 51 in date_age.bins

    # dated basis with plain delta and no time axis: keys and entry count of
    # the dated set are expected to be unchanged
    stitched = stitch_histograms(hists_basis=dated, hists_delta=plain)
    check_keys(stitched, dated_keys)
    assert stitched["date:latitude"].entries == 400
    assert 50 not in stitched["date:latitude"].bins

    # dated set added to itself
    stitched = stitch_histograms(hists_basis=dated, hists_delta=dated)
    check_keys(stitched, dated_keys)
    assert stitched["date:latitude"].entries == 800
    assert 50 not in stitched["date:latitude"].bins

    # plain set added to itself
    stitched = stitch_histograms(hists_basis=plain, hists_delta=plain)
    check_keys(stitched, plain_keys)
    assert stitched["age"].entries == 800

    # dated set stitched with itself in replace mode: entry count stays 400
    stitched = stitch_histograms(hists_basis=dated, hists_delta=dated, mode="replace")
    check_keys(stitched, dated_keys)
    assert stitched["date:latitude"].entries == 400

    # plain histograms placed at time bin 1, replacing that bin of the dated set
    stitched = stitch_histograms(
        hists_basis=dated,
        hists_delta=plain,
        time_axis="date",
        time_bin_idx=[1],
        mode="replace",
    )
    check_keys(stitched, dated_keys)
    assert stitched["date:isActive"].bins[1].entries == 400
    assert stitched["date:isActive"].entries == 777