Example #1
0
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Each histogram container must be similar to itself, while every
    mismatching pair must make ``assert_similar_hists`` raise an
    ``AssertionError`` whose first argument is the expected message.
    """
    # dummy dataset with mixed types; built explicitly with the same columns
    # as pandas' former ``makeMixedDataFrame`` helper, because
    # ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0
    df = pd.DataFrame({
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pd.bdate_range('1/1/2009', periods=5),
    })
    # convert timestamp (col D) to nanosec since 1970-1-1
    df['date'] = df['D'].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit('A'))
    hist1 = hg.Categorize(unit('C'))
    hist2 = hg.Bin(5, 0, 5, unit('A'), value=hist1)
    hist3 = hg.Categorize(unit('C'), value=hist0)

    # both 3d-histograms share the same sparse date axis (daily bins)
    origin = pd.Timestamp('2009-01-01').value
    bin_width = pd.Timedelta(days=1).value
    hist4 = hg.SparselyBin(origin=origin,
                           binWidth=bin_width,
                           quantity=unit('date'),
                           value=hist2)
    hist5 = hg.SparselyBin(origin=origin,
                           binWidth=bin_width,
                           quantity=unit('date'),
                           value=hist3)

    # fill them
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]
    for hist in hists:
        hist.fill.numpy(df)

    containers = [HistogramContainer(hist) for hist in hists]
    hc0, hc1, hc2, hc3, hc4, hc5 = containers

    # a histogram is always similar to itself
    for hc in containers:
        assert check_similar_hists([hc, hc])

    # histograms of equal dimension but different structure must be rejected
    expected = 'Input histograms are not all similar.'
    for left, right in [(hc0, hc1), (hc2, hc3), (hc4, hc5)]:
        # stays empty when nothing is raised, so the assert below fails
        args = ['']
        try:
            assert_similar_hists([left, right])
        except AssertionError as e:
            args = e.args
        assert args[0] == expected
Example #2
0
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Each histogram must be similar to itself, while every mismatching pair
    must make ``assert_similar_hists`` raise a ``ValueError`` whose first
    argument is the expected message.
    """
    # dummy dataset with mixed types; built explicitly with the same columns
    # as pandas' former ``makeMixedDataFrame`` helper, because
    # ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)

    # both 3d-histograms share the same sparse date axis (daily bins)
    origin = pd.Timestamp("2009-01-01").value
    bin_width = pd.Timedelta(days=1).value
    hist4 = hg.SparselyBin(
        origin=origin,
        binWidth=bin_width,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=origin,
        binWidth=bin_width,
        quantity=unit("date"),
        value=hist3,
    )

    # fill them
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]
    for hist in hists:
        hist.fill.numpy(df)

    # a histogram is always similar to itself
    for hist in hists:
        assert check_similar_hists([hist, hist])

    # histograms of equal dimension but different structure must be rejected
    expected = "Input histograms are not all similar."
    for left, right in [(hist0, hist1), (hist2, hist3), (hist4, hist5)]:
        # stays empty when nothing is raised, so the assert below fails
        args = [""]
        try:
            assert_similar_hists([left, right])
        except ValueError as e:
            args = e.args
        assert args[0] == expected
Example #3
0
def test_check_similar_hists():
    """Test similarity of list of histograms

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Each histogram container must be reported as similar to itself, while
    every mismatching pair must be reported as not similar.
    """
    # dummy dataset with mixed types; built explicitly with the same columns
    # as pandas' former ``makeMixedDataFrame`` helper, because
    # ``pd.util.testing`` was deprecated in pandas 1.0 and removed in 2.0
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)

    # both 3d-histograms share the same sparse date axis (daily bins)
    origin = pd.Timestamp("2009-01-01").value
    bin_width = pd.Timedelta(days=1).value
    hist4 = hg.SparselyBin(
        origin=origin,
        binWidth=bin_width,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=origin,
        binWidth=bin_width,
        quantity=unit("date"),
        value=hist3,
    )

    # fill them
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]
    for hist in hists:
        hist.fill.numpy(df)

    containers = [HistogramContainer(hist) for hist in hists]
    hc0, hc1, hc2, hc3, hc4, hc5 = containers

    # a histogram is always similar to itself
    for hc in containers:
        assert check_similar_hists([hc, hc])

    # histograms of equal dimension but different structure are not similar
    for left, right in [(hc0, hc1), (hc2, hc3), (hc4, hc5)]:
        assert not check_similar_hists([left, right])
Example #4
0
def test_hist_splitter_filter():
    """Test of hist_splitter option filter_empty_split_hists

    One of the split histograms of type date:A_score:num_employees is empty
    and only contains a NaN. With ``filter_empty_split_hists=False`` those
    empty split-histograms are kept, so the resulting list of
    split-histograms is of inconsistent types and must fail the similarity
    check.
    """
    hist_list = ["date:A_score:num_employees"]
    features = ["A_score:num_employees"]

    # read the example histograms, then split them without filtering
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist",
        store_key="output_hist",
        features=hist_list,
        filter_empty_split_hists=False,
    )
    datastore = Pipeline(modules=[reader, splitter]).transform(datastore={})

    # the splitter output is a dict with exactly one entry per feature
    assert "output_hist" in datastore
    output = datastore["output_hist"]
    assert isinstance(output, dict)
    assert len(output) == len(features)

    for feature in features:
        assert feature in output
    for feature in features:
        assert isinstance(output[feature], pd.DataFrame)

    # the unfiltered split-histograms must not all be similar
    for feature in features:
        records = output[feature].reset_index().to_dict("records")
        split_hists = [record["histogram"] for record in records]
        assert check_similar_hists(split_hists) is False