Exemple #1
0
def test_make_histograms_with_time_axis():
    """Exercise make_histograms with a time axis, then each get_bin_specs call form.

    The first part checks the five values returned by
    ``make_histograms(..., time_axis=True, ret_specs=True)``. The remainder
    checks ``get_bin_specs`` on the whole histogram dict and on the single
    'date:age' histogram, each with and without ``skip_first_axis``.
    """
    # NOTE(review): a function with this exact name is defined twice in this
    # file; on import the later definition replaces the earlier one.

    time_bin_width = 751582381944448.0

    def check_date_age_specs(specs):
        # Index 0 is the first axis of 'date:age', index 1 the second.
        assert specs[0]['bin_width'] == time_bin_width
        assert specs[1]['bin_width'] == 2.0
        assert specs[1]['bin_offset'] == 9.5

    def check_age_specs(spec):
        # Single-axis spec that remains once the first axis is skipped.
        assert spec['bin_width'] == 2.0
        assert spec['bin_offset'] == 9.5

    result = make_histograms(pytest.test_df, time_axis=True, ret_specs=True)
    hists, features, bin_specs, time_axis, var_dtype = result

    # Sizes of the returned collections.
    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21

    # The time axis is 'date' and every feature is prefixed with it.
    assert time_axis == 'date'
    assert 'date:age' in hists
    date_age_hist = hists['date:age']
    assert date_age_hist.binWidth == time_bin_width
    for feature in features:
        parts = feature.split(':')
        assert len(parts) == 2 and parts[0] == 'date'

    # Every returned bin specification covers two axes.
    for axis_specs in bin_specs.values():
        assert len(axis_specs) == 2
    assert 'date:age' in bin_specs
    check_date_age_specs(bin_specs['date:age'])

    # get_bin_specs 1: dict of histograms, all axes kept
    specs_all = get_bin_specs(hists)
    assert 'date:age' in specs_all
    check_date_age_specs(specs_all['date:age'])

    # get_bin_specs 2: dict of histograms, first axis skipped
    specs_skipped = get_bin_specs(hists, skip_first_axis=True)
    assert 'age' in specs_skipped
    check_age_specs(specs_skipped['age'])

    # get_bin_specs 3: single histogram, all axes kept
    check_date_age_specs(get_bin_specs(hists['date:age']))

    # get_bin_specs 4: single histogram, first axis skipped
    check_age_specs(get_bin_specs(hists['date:age'], skip_first_axis=True))
def test_make_histograms_with_time_axis():
    """Verify time-axis histograms and the bin specifications derived from them.

    Calls ``make_histograms`` with ``time_axis=True`` and ``ret_specs=True``,
    checks the returned histograms, features, bin specs, time axis and dtypes,
    and then checks ``get_bin_specs`` for a dict of histograms and for one
    histogram, with and without ``skip_first_axis``.
    """
    # NOTE(review): this file contains two definitions with this name; the
    # later definition is the one bound after the module is imported.

    key = "date:age"
    expected_time_width = 751582381944448.0
    expected_age_width = 2.0
    expected_age_offset = 9.5

    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        time_axis=True,
        ret_specs=True,
    )

    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21
    assert time_axis == "date"
    assert key in hists
    assert hists[key].binWidth == expected_time_width

    # Each feature name has the form "date:<variable>".
    for name in features:
        pieces = name.split(":")
        assert len(pieces) == 2 and pieces[0] == "date"

    # Each bin specification holds one entry per axis (two axes here).
    assert all(len(spec) == 2 for spec in bin_specs.values())
    assert key in bin_specs
    returned = bin_specs[key]
    assert returned[0]["bin_width"] == expected_time_width
    assert returned[1]["bin_width"] == expected_age_width
    assert returned[1]["bin_offset"] == expected_age_offset

    # test get_bin_specs 1 -- dict input, both axes
    from_dict = get_bin_specs(hists)
    assert key in from_dict
    both_axes = from_dict[key]
    assert both_axes[0]["bin_width"] == expected_time_width
    assert both_axes[1]["bin_width"] == expected_age_width
    assert both_axes[1]["bin_offset"] == expected_age_offset

    # test get_bin_specs 2 -- dict input, first axis dropped
    from_dict_skipped = get_bin_specs(hists, skip_first_axis=True)
    assert "age" in from_dict_skipped
    age_only = from_dict_skipped["age"]
    assert age_only["bin_width"] == expected_age_width
    assert age_only["bin_offset"] == expected_age_offset

    # test get_bin_specs 3 -- single histogram input, both axes
    from_hist = get_bin_specs(hists[key])
    assert from_hist[0]["bin_width"] == expected_time_width
    assert from_hist[1]["bin_width"] == expected_age_width
    assert from_hist[1]["bin_offset"] == expected_age_offset

    # test get_bin_specs 4 -- single histogram input, first axis dropped
    from_hist_skipped = get_bin_specs(hists[key], skip_first_axis=True)
    assert from_hist_skipped["bin_width"] == expected_age_width
    assert from_hist_skipped["bin_offset"] == expected_age_offset
def test_histogram_stitching():
    """Check stitch_histograms across its argument combinations.

    Two histogram sets are built from ``pytest.test_df``: one whose features
    carry a leading 'date' axis and one without it, using the bin specs of the
    first set with the first axis skipped. They are then stitched with
    different ``time_axis`` / ``time_bin_idx`` / ``mode`` settings, and the
    resulting keys and entry counts are checked.
    """

    def assert_keys(hists, expected):
        # Compare the sorted histogram names against the expected list.
        np.testing.assert_array_equal(sorted(hists.keys()), expected)

    dated_features = sorted(["date:isActive", "date:eyeColor", "date:latitude"])
    plain_features = sorted(["isActive", "eyeColor", "latitude", "age"])

    dated_hists = make_histograms(pytest.test_df, features=dated_features)
    specs = get_bin_specs(dated_hists, skip_first_axis=True)
    plain_hists = make_histograms(
        pytest.test_df, features=plain_features, bin_specs=specs
    )

    # Give the plain histograms a 'date' axis at bin 50, stitch onto the dated ones.
    stitched = stitch_histograms(
        hists_basis=dated_hists,
        hists_delta=plain_hists,
        time_axis="date",
        time_bin_idx=[50],
    )
    assert_keys(stitched, dated_features)
    assert stitched["date:isActive"].entries == 800
    assert stitched["date:isActive"].bins[50].entries == 400

    # Expected keys whenever the plain set (which includes 'age') gains a date axis.
    dated_with_age = sorted(dated_features + ["date:age"])

    # Plain + plain, placed at explicit time bins 50 and 51.
    stitched = stitch_histograms(
        hists_basis=plain_hists,
        hists_delta=plain_hists,
        time_axis="date",
        time_bin_idx=[50, 51],
    )
    assert_keys(stitched, dated_with_age)
    assert stitched["date:age"].entries == 800
    assert stitched["date:age"].bins[50].entries == 400
    assert stitched["date:age"].bins[51].entries == 400

    # Plain + plain with no time_bin_idx: bins 0 and 1 are expected.
    stitched = stitch_histograms(
        hists_basis=plain_hists, hists_delta=plain_hists, time_axis="date"
    )
    assert_keys(stitched, dated_with_age)
    assert stitched["date:age"].entries == 800
    assert 0 in stitched["date:age"].bins
    assert 1 in stitched["date:age"].bins

    # Plain + plain with a scalar time_bin_idx of 50: bin 51 is expected too.
    stitched = stitch_histograms(
        hists_basis=plain_hists,
        hists_delta=plain_hists,
        time_axis="date",
        time_bin_idx=50,
    )
    assert_keys(stitched, dated_with_age)
    assert stitched["date:age"].entries == 800
    assert 51 in stitched["date:age"].bins

    # Dated + plain without a time axis: no overlapping keys, dated set is returned.
    stitched = stitch_histograms(hists_basis=dated_hists, hists_delta=plain_hists)
    assert_keys(stitched, dated_features)
    assert stitched["date:latitude"].entries == 400
    assert 50 not in stitched["date:latitude"].bins

    # Dated + dated: entries are summed.
    stitched = stitch_histograms(hists_basis=dated_hists, hists_delta=dated_hists)
    assert_keys(stitched, dated_features)
    assert stitched["date:latitude"].entries == 800
    assert 50 not in stitched["date:latitude"].bins

    # Plain + plain without a time axis: entries are summed.
    stitched = stitch_histograms(hists_basis=plain_hists, hists_delta=plain_hists)
    assert_keys(stitched, plain_features)
    assert stitched["age"].entries == 800

    # Dated with dated in "replace" mode: the entry count is not doubled.
    stitched = stitch_histograms(
        hists_basis=dated_hists, hists_delta=dated_hists, mode="replace"
    )
    assert_keys(stitched, dated_features)
    assert stitched["date:latitude"].entries == 400

    # Plain histograms placed at time bin 1 of the dated ones, in "replace" mode.
    stitched = stitch_histograms(
        hists_basis=dated_hists,
        hists_delta=plain_hists,
        time_axis="date",
        time_bin_idx=[1],
        mode="replace",
    )
    assert_keys(stitched, dated_features)
    assert stitched["date:isActive"].bins[1].entries == 400
    assert stitched["date:isActive"].entries == 777