# Example 1
def test_all_options_hartigan():
    """
    Test that Hartigan clustering succeeds when every option is supplied.
    """
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )

    clusters = HartiganCluster(
        calldays=call_days, radius=50, buffer=2, call_threshold=2
    ).to_geopandas()
    expected_columns = {
        "subscriber",
        "geometry",
        "rank",
        "calldays",
        "site_id",
        "version",
        "centroid",
    }
    assert set(clusters.columns) == expected_columns
# Example 2
def test_call_threshold_works(get_dataframe):
    """
    Test that a call threshold above 1 limits the number of clusters.
    """
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )

    # With the default threshold there should be clusters built from a
    # single call day.
    unthresholded = HartiganCluster(calldays=call_days, radius=50).to_geopandas()
    assert any(unthresholded.calldays == 1)

    # Raising the threshold must prune those clusters away.
    thresholded = get_dataframe(
        HartiganCluster(calldays=call_days, radius=50, call_threshold=2)
    )
    assert len(unthresholded) > len(thresholded)
def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber-location pair.
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "m9jL23", "2016-01-01", "2016-01-03", 2),
        ("Z89mWDgZrr3qpnlB", "qvkp6J", "2016-01-01", "2016-01-08", 4),
        ("038OVABN11Ak4W5P", "QeBRM8", "2016-01-01", "2016-01-03", 1),
        ("038OVABN11Ak4W5P", "nWM8R3", "2016-01-01", "2016-01-08", 5),
    )
    for subscriber, location, start, end, expected_calls in test_values:
        call_days = CallDays(
            subscriber_locations(start, end, level="versioned-site")
        )
        # Filter down to the single subscriber/site pair under test.
        row = get_dataframe(call_days).query(
            f'subscriber == "{subscriber}" & site_id == "{location}"'
        )
        assert row.calldays.values[0] == expected_calls
def test_column_names_meaningful_locations(get_column_names_from_run):
    """Test that the column_names property matches head(0) for MeaningfulLocations."""
    locations = subscriber_locations(
        start="2016-01-01", stop="2016-01-02", level="versioned-site"
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=locations), radius=1
    )
    scores = EventScore(
        start="2016-01-01", stop="2016-01-02", level="versioned-site"
    )
    mfl = MeaningfulLocations(
        clusters=clusters, scores=scores, labels=labels, label="evening"
    )

    assert get_column_names_from_run(mfl) == mfl.column_names
# Example 5
def test_buffered_hartigan():
    """
    Test that Hartigan produces buffered clusters when buffer is larger than 0.
    """
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )

    clusters = HartiganCluster(
        calldays=call_days, radius=50, buffer=2
    ).to_geopandas()
    # Since the mock data does not have geom_area in the site table we either
    # get the clusters with area equivalent to 2km2 (areas below are in
    # degrees) or None.
    cluster_areas = clusters.geometry.area
    assert cluster_areas.min() == pytest.approx(0.0011327683603873115)
    assert cluster_areas.max() == pytest.approx(0.001166624454009738)
def test_join_with_polygon(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can get the (arbitrary) polygon
    of each cell.
    """
    locations = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    joined = JoinToLocation(
        locations,
        level="polygon",
        column_name="admin3pcod",
        polygon_table="geography.admin3",
        geom_col="geom",
    )
    joined_df = get_dataframe(joined)

    # The join should add the polygon column and keep every original row.
    assert sorted(joined_df.columns) == sorted(
        ["admin3pcod", "location_id", "subscriber", "time"]
    )
    assert len(joined_df) == get_length(locations)
# Example 7
def test_bigger_radius_yields_fewer_clusters(get_dataframe):
    """
    Test that a bigger radius yields fewer clusters per subscriber.
    """
    radii = [1, 2, 5, 10, 50]
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )

    # Walk through the radii in increasing order, comparing each
    # per-subscriber cluster count against the previous (smaller) radius.
    previous_counts = None
    for radius in radii:
        clusters = get_dataframe(
            HartiganCluster(calldays=call_days, radius=radius)
        )
        counts = clusters.groupby("subscriber").size()
        if previous_counts is not None:
            assert all(previous_counts >= counts)
        previous_counts = counts
def test_join_with_versioned_cells(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can fetch the cell version.
    """
    locations = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    joined_df = get_dataframe(JoinToLocation(locations, level="versioned-cell"))
    # As our database is complete we should not drop any rows.
    assert len(joined_df) == get_length(locations)

    # Rows before the changeover date, or for towers that have not moved,
    # should all be version zero.
    is_version_zero = (joined_df.time <= move_date) | (
        ~joined_df.location_id.isin(moving_sites)
    )
    # Rows after the changeover for towers that did move should all be
    # version one.
    is_version_one = (joined_df.time > move_date) & joined_df.location_id.isin(
        moving_sites
    )

    assert (joined_df[is_version_zero].version == 0).all()
    assert (joined_df[is_version_one].version == 1).all()
def test_join_with_lat_lon(get_dataframe):
    """
    Test that flowmachine.JoinToLocation can get the lat-lon values of the cell
    """
    ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    df = get_dataframe(JoinToLocation(ul, level="lat-lon"))

    expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"])
    assert sorted(df.columns) == expected_cols
    # Pick out one cell that moves location and assert that the
    # lat-lons are right
    focal_cell = "dJb0Wd"
    lat1, long1 = (27.648837800000003, 83.09284486)
    lat2, long2 = (27.661443318109132, 83.25769074752517)
    post_move = df[(df.time > move_date) & (df["location_id"] == focal_cell)]
    pre_move = df[(df.time < move_date) & (df["location_id"] == focal_cell)]
    # And check them all one-by-one.
    # BUG FIX: these np.isclose(...).all() results were previously bare
    # expressions, so the lat/lon checks could never fail — assert them.
    assert np.isclose(pre_move.lat, lat1).all()
    assert np.isclose(pre_move.lon, long1).all()
    assert np.isclose(post_move.lat, lat2).all()
    assert np.isclose(post_move.lon, long2).all()
def _make_meaningful_locations_object(
    *,
    start_date,
    stop_date,
    label,
    labels,
    subscriber_subset,
    tower_cluster_call_threshold,
    tower_cluster_radius,
    tower_day_of_week_scores,
    tower_hour_of_day_scores,
):
    """
    Build a MeaningfulLocations query object from the exposed parameters.

    Composes subscriber locations -> call days -> Hartigan clusters and an
    event score, then combines them into a MeaningfulLocations query.
    """
    # Note: this 'level' is not the same as the exposed parameter
    # 'aggregation_unit'.
    locations = subscriber_locations(
        start=start_date,
        stop=stop_date,
        level="versioned-site",
        subscriber_subset=subscriber_subset,
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=locations),
        radius=tower_cluster_radius,
        call_threshold=tower_cluster_call_threshold,
        # We're not exposing 'buffer', apparently, so we're hard-coding it.
        buffer=0,
    )
    # As above, this 'level' is not the exposed 'aggregation_unit'.
    scores = EventScore(
        start=start_date,
        stop=stop_date,
        score_hour=tower_hour_of_day_scores,
        score_dow=tower_day_of_week_scores,
        level="versioned-site",
        subscriber_subset=subscriber_subset,
    )
    return MeaningfulLocations(
        clusters=clusters,
        labels=labels,
        scores=scores,
        label=label,
    )
def test_meaningful_locations_aggregate_disallowed_level_raises():
    """Test that a bad level raises a BadLevelError."""

    with pytest.raises(BadLevelError):
        locations = subscriber_locations(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        )
        meaningful = MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(subscriber_locations=locations), radius=1
            ),
            scores=EventScore(
                start="2016-01-01", stop="2016-01-02", level="versioned-site"
            ),
            labels=labels,
            label="evening",
        )
        MeaningfulLocationsAggregate(
            meaningful_locations=meaningful, level="NOT_A_LEVEL"
        )
def test_meaningful_locations_od_raises_for_bad_level(exemplar_level_param,
                                                      get_dataframe):
    """
    Test that od on meaningful locations raises a BadLevelError for a bad level.
    """
    locations = subscriber_locations(
        start="2016-01-01", stop="2016-01-02", level="versioned-site"
    )
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(subscriber_locations=locations), radius=1
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )

    with pytest.raises(BadLevelError):
        MeaningfulLocationsOD(
            meaningful_locations_a=meaningful,
            meaningful_locations_b=meaningful,
            level="NOT_A_LEVEL",
        )
# Example 13
def test_different_call_days_format(get_dataframe):
    """
    Test whether we can pass different call days format such as table name, SQL query and CallDays class.
    """
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    # 1. Pass the CallDays query object directly.
    from_query = get_dataframe(HartiganCluster(calldays=call_days, radius=50))
    assert isinstance(from_query, pd.DataFrame)

    # Persist the call days so they are available as a database table.
    call_days.store().result()

    # 2. Pass a Table object wrapping the stored call days.
    from_table = get_dataframe(
        HartiganCluster(
            calldays=Table(call_days.fully_qualified_table_name), radius=50
        )
    )
    assert isinstance(from_table, pd.DataFrame)

    # 3. Pass a CustomQuery built from the raw SQL of the call days query.
    from_sql = get_dataframe(
        HartiganCluster(
            calldays=CustomQuery(call_days.get_query(), call_days.column_names),
            radius=50,
        )
    )
    assert isinstance(from_sql, pd.DataFrame)
def test_meaningful_locations_results(label, expected_number_of_clusters,
                                      get_dataframe):
    """
    Test that MeaningfulLocations returns expected results and counts clusters per subscriber correctly.
    """
    locations = subscriber_locations(
        start="2016-01-01", stop="2016-01-02", level="versioned-site"
    )
    meaningful = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(subscriber_locations=locations), radius=1
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label=label,
    )
    result = get_dataframe(meaningful)
    assert len(result) == expected_number_of_clusters
    # Check that query has correctly counted the number of clusters per
    # subscriber.
    per_subscriber = result.groupby(
        ["subscriber", "label", "n_clusters"], as_index=False
    ).count()
    assert all(per_subscriber.n_clusters == per_subscriber.cluster)
# Example 15
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    Test that the first ranked call day of each subscriber is in the first cluster of each subscriber.
    """
    call_days = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    call_days_df = get_dataframe(call_days)

    clusters_df = HartiganCluster(calldays=call_days, radius=50).to_geopandas()

    # First ranked call day, and first cluster, per subscriber.
    key_columns = ["subscriber", "site_id", "version"]
    first_call_day = call_days_df[key_columns].groupby("subscriber").first()
    first_cluster = clusters_df[key_columns].groupby("subscriber").first()

    joined = first_call_day.join(first_cluster, lsuffix="_cd", rsuffix="_har")

    def _first_day_in_first_cluster(row):
        # The first call day's site/version must appear in the first cluster.
        return (row.site_id_cd in row.site_id_har) and (
            row.version_cd in row.version_har
        )

    assert all(joined.apply(_first_day_in_first_cluster, axis=1))
def test_calldays_column_names(exemplar_level_param):
    """Test that the CallDays column_names property is correct."""
    query = CallDays(
        subscriber_locations(
            "2016-01-01", "2016-01-03", **exemplar_level_param
        )
    )
    # head(0) hits the database, so this checks the property against reality.
    assert query.head(0).columns.tolist() == query.column_names