Ejemplo n.º 1
0
def test_custom_query_hash():
    """Test that CustomQuery hashing is same with case & space differences."""
    base = CustomQuery("SELECT * from foo")
    case_mismatch = CustomQuery("select * FROM foo")
    space_mismatch = CustomQuery("""select    *
                                 FROM foo""")
    assert base.md5 == case_mismatch.md5
    assert base.md5 == space_mismatch.md5
def test_dropna(get_length):
    """
    Test that we are able to keep rows with NA values.
    """

    start, stop = "2016-01-01", "2016-01-03"
    msisdn = "1vGR8kp342yxEpwY"
    sql = """
    select 
        msisdn as subscriber,
        2 as val 
    from 
        events.calls 
    where 
        msisdn = '{}' 
    limit 1
    """.format(
        msisdn
    )

    metrics = [CustomQuery(sql, ["subscriber"]), RadiusOfGyration(start, stop)]
    fc = feature_collection(metrics, dropna=False)

    # usully without dropna=False this query would only return
    # a single row. We check that this is not the case.
    assert get_length(fc) > 1
def test_subsetting_of_query(get_dataframe):
    """
    Check that query ids and length of results of some subsetted queries are as expected.
    """
    selected_subscriber_ids = [
        "1jwYL3Nl1Y46lNeQ",
        "nLvm2gVnEdg7lzqX",
        "jwKJorl0yBrZX5N8",
    ]
    custom_query = CustomQuery(
        "SELECT duration, msisdn as subscriber FROM events.calls WHERE duration < 10",
        ["duration", "subscriber"],
    )
    subsetter_1 = SubscriberSubsetterForAllSubscribers()
    subsetter_2 = SubscriberSubsetterForExplicitSubset(selected_subscriber_ids)
    subsetter_3 = SubscriberSubsetterForFlowmachineQuery(custom_query)

    dl_0 = daily_location(date="2016-01-01")
    dl_1 = daily_location(date="2016-01-01", subscriber_subset=subsetter_1)
    dl_2 = daily_location(date="2016-01-01", subscriber_subset=subsetter_2)
    dl_3 = daily_location(date="2016-01-01", subscriber_subset=subsetter_3)

    assert 499 == len(get_dataframe(dl_0))
    assert 499 == len(get_dataframe(dl_1))
    assert 3 == len(get_dataframe(dl_2))
    assert 26 == len(get_dataframe(dl_3))
Ejemplo n.º 4
0
def test_daily_location_5_df(get_dataframe, diff_reporter):
    """
    Daily location query with non-default parameters returns the expected data.
    """
    subset_query = CustomQuery(
        """
        SELECT DISTINCT msisdn AS subscriber
        FROM events.calls
        WHERE (   (datetime >= '2016-01-01 08:00:00' AND datetime <= '2016-01-01 20:00:00' AND substring(tac::TEXT, 0, 2) = '68')
               OR (datetime >= '2016-01-07 14:00:00' AND datetime <= '2016-01-07 15:00:00' AND duration < 400))
        """,
        ["subscriber"],
    )

    dl = daily_location(
        "2016-01-02",
        spatial_unit=make_spatial_unit("admin", level=3),
        hours=(4, 9),
        method="most-common",
        # subscriber_identifier="imei",
        # ignore_nulls=False,
        subscriber_subset=subset_query,
    )
    df = get_dataframe(dl)
    verify(df.to_csv(), diff_reporter)
def test_raises_error_if_flowmachine_query_does_not_contain_subscriber_column(
):
    """
    An error is raised when creating a subsetter from a flowmachine query that doesn't contain a column named 'subscriber'.
    """
    flowmachine_query = CustomQuery(
        "SELECT msisdn, duration FROM events.calls", ["msisdn", "duration"])
    with pytest.raises(ValueError):
        _ = make_subscriber_subsetter(flowmachine_query)
Ejemplo n.º 6
0
def test_get_geom_query_column_names(
    exemplar_spatial_unit_param, get_column_names_from_run
):
    """
    Test that the get_geom_query method returns a query with the correct columns.
    """
    if not exemplar_spatial_unit_param.has_geography:
        pytest.skip("CellSpatialUnit does not have a get_geom_query method")
    geom_query = exemplar_spatial_unit_param.get_geom_query()
    cols = exemplar_spatial_unit_param.location_id_columns + ["geom"]
    cq = CustomQuery(geom_query, cols)
    assert sorted(get_column_names_from_run(cq)) == sorted(cols)
Ejemplo n.º 7
0
def test_create_histogram_censors(get_dataframe):
    """
    Histogram should be censored if any bin has a count below 15.
    """
    query = CustomQuery("SELECT * FROM generate_series(0, 10) AS t(value)",
                        column_names=["value"])

    agg = HistogramAggregation(metric=query, bins=5)
    df = get_dataframe(agg)

    assert len(df) == 1
    assert df.value[0] is None
    assert df.lower_edge[0] is None
    assert df.upper_edge[0] is None
Ejemplo n.º 8
0
def test_contact_reference_location_no_spatial_unit_raises():
    """ Test ValueError is raised for contact_location without spatial_unit attribute. """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    # by encapsulating ModalLocations in a CustomQuery we remove the spatial_unit
    # attribute from it which should raise an error
    ml = ModalLocation(*[
        daily_location(
            d,
            spatial_unit=make_spatial_unit("versioned-cell"),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        ) for d in list_of_dates("2016-01-01", "2016-01-03")
    ])
    ml = CustomQuery(ml.get_query(), ml.column_names)
    with pytest.raises(ValueError):
        query = ContactReferenceLocationStats(cb, ml)
Ejemplo n.º 9
0
def test_daily_location_4_df(get_dataframe, diff_reporter):
    """
    Regression test; the expected result is empty because the given subscriber does not make any calls on the given date.
    """
    subset_query = CustomQuery(
        "SELECT * FROM (VALUES ('dr9xNYK006wykgXj')) as tmp (subscriber)",
        ["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        table="events.calls",
        hours=(22, 6),
        subscriber_subset=subset_query,
    )
    df = get_dataframe(dl)
    verify(df.to_csv(), diff_reporter)
Ejemplo n.º 10
0
def test_daily_location_4_sql(diff_reporter):
    """
    Regression test; this verifies the SQL statement for the test below (which checks the resulting dataframe)
    """
    subset_query = CustomQuery(
        "SELECT * FROM (VALUES ('dr9xNYK006wykgXj')) as tmp (subscriber)",
        column_names=["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        table="events.calls",
        hours=(22, 6),
        subscriber_subset=subset_query,
    )
    sql = pretty_sql(dl.get_query())
    verify(sql, diff_reporter)
Ejemplo n.º 11
0
    def test_different_call_days_format(self):
        """
        Test whether we can pass different call days format such as table name, SQL query and CallDays class.
        """
        cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        har = HartiganCluster(cd, 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        cd.store().result()

        har = HartiganCluster(Table(cd.table_name), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        cd_query = cd.get_query()
        har = HartiganCluster(CustomQuery(cd_query), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)
Ejemplo n.º 12
0
def test_daily_location_1_sql(diff_reporter):
    """
    Daily location query with non-default parameters returns the expected data.
    """
    subset_query = CustomQuery(
        "SELECT DISTINCT msisdn AS subscriber FROM events.calls WHERE msisdn in ('GNLM7eW5J5wmlwRa', 'e6BxY8mAP38GyAQz', '1vGR8kp342yxEpwY')",
        column_names=["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        spatial_unit=make_spatial_unit("cell"),
        hours=(23, 5),
        method="last",
        subscriber_subset=subset_query,
    )
    sql = pretty_sql(dl.get_query())
    diff_reporter(sql)
Ejemplo n.º 13
0
def test_daily_location_6_df(get_dataframe, diff_reporter):
    """
    Regression test; the expected result is empty because the given subscriber does not make any calls on the given date.
    """
    subset_query = CustomQuery(
        """
        SELECT outgoing, datetime, duration, msisdn AS subscriber
        FROM events.calls
        WHERE datetime::date = '2016-01-01' AND duration > 2000
        """,
        ["outgoing", "datetime", "duration", "subscriber"],
    )
    dl = daily_location("2016-01-03",
                        table="events.calls",
                        subscriber_subset=subset_query)
    df = get_dataframe(dl)
    verify(df.to_csv(), diff_reporter)
Ejemplo n.º 14
0
def test_daily_location_6_sql(diff_reporter):
    """
    Regression test; this verifies the SQL statement for the test below (which checks the resulting dataframe)
    """
    subset_query = CustomQuery(
        """
        SELECT outgoing, datetime, duration, msisdn AS subscriber
        FROM events.calls
        WHERE datetime::date = '2016-01-01' AND duration > 2000
        """,
        ["subscriber"],
    )
    dl = daily_location("2016-01-03",
                        table="events.calls",
                        subscriber_subset=subset_query)
    sql = pretty_sql(dl.get_query())
    verify(sql, diff_reporter)
Ejemplo n.º 15
0
def test_daily_location_3_df(get_dataframe, diff_reporter):
    """
    Daily location query with non-default parameters returns the expected data.
    """
    subset_query = CustomQuery(
        "SELECT DISTINCT msisdn AS subscriber FROM events.calls WHERE msisdn in ('GNLM7eW5J5wmlwRa', 'e6BxY8mAP38GyAQz', '1vGR8kp342yxEpwY')",
        column_names=["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        spatial_unit=make_spatial_unit("cell"),
        hours=(23, 5),
        method="last",
        # subscriber_identifier="imei",
        # ignore_nulls=False,
        subscriber_subset=subset_query,
    )
    df = get_dataframe(dl)
    verify(df.to_csv(), diff_reporter)
Ejemplo n.º 16
0
def test_daily_location_1_df(get_dataframe, diff_reporter):
    """
    Daily location query with non-default parameters returns the expected data.
    """
    # Note that subscriber `1vGR8kp342yxEpwY` should be missing from the result
    # because they have no event on 2016-01-05 after 11pm or before 5am.
    subset_query = CustomQuery(
        "SELECT DISTINCT msisdn AS subscriber FROM events.calls WHERE msisdn in ('GNLM7eW5J5wmlwRa', 'e6BxY8mAP38GyAQz', '1vGR8kp342yxEpwY')",
        column_names=["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        spatial_unit=make_spatial_unit("cell"),
        hours=(23, 5),
        method="last",
        subscriber_subset=subset_query,
    )
    df = get_dataframe(dl)
    diff_reporter(df.to_csv())
Ejemplo n.º 17
0
def test_non_default_value_column(get_dataframe):
    """
    Test a non-default value column.
    """
    query = CustomQuery(
        "SELECT * FROM generate_series(0, 100) AS t(non_default_value_column)",
        column_names=["non_default_value_column"],
    )

    agg = HistogramAggregation(metric=query,
                               bins=5,
                               value_column="non_default_value_column")
    df = get_dataframe(agg)
    numpy_histogram, numpy_bins = np.histogram(
        get_dataframe(query).non_default_value_column, bins=5)
    assert df.value.sum() == len(get_dataframe(query))
    assert numpy_histogram.tolist() == df.value.tolist()
    assert numpy_bins.tolist()[:-1] == pytest.approx(df.lower_edge.tolist())
    assert numpy_bins.tolist()[1:] == pytest.approx(df.upper_edge.tolist())
def test_daily_location_3_sql(diff_reporter):
    """
    Daily location query with non-default parameters returns the expected data.
    """
    subset_query = CustomQuery(
        "SELECT DISTINCT msisdn AS subscriber FROM events.calls WHERE msisdn in ('GNLM7eW5J5wmlwRa', 'e6BxY8mAP38GyAQz', '1vGR8kp342yxEpwY')",
        column_names=["subscriber"],
    )
    dl = daily_location(
        "2016-01-05",
        level="cell",
        hours=(23, 5),
        method="last",
        # subscriber_identifier="imei",
        # column_name="admin2pcod",
        # ignore_nulls=False,
        subscriber_subset=subset_query,
    )
    sql = pretty_sql(dl.get_query())
    verify(sql, diff_reporter)
Ejemplo n.º 19
0
def test_print_dependency_tree():
    """
    Test that the expected dependency tree is printed for a daily location query (with an explicit subset).
    """
    subscriber_subsetter = make_subscriber_subsetter(
        CustomQuery(
            "SELECT duration, msisdn as subscriber FROM events.calls WHERE duration < 10",
            ["duration", "subscriber"],
        ))
    q = daily_location(date="2016-01-02",
                       method="most-common",
                       subscriber_subset=subscriber_subsetter)

    expected_output = textwrap.dedent("""\
        <Query of type: MostFrequentLocation, query_id: 'xxxxx'>
          - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
             - <Table: 'geography.admin3', query_id: 'xxxxx'>
          - <Query of type: SubscriberLocations, query_id: 'xxxxx'>
             - <Query of type: JoinToLocation, query_id: 'xxxxx'>
                - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
                   - <Table: 'geography.admin3', query_id: 'xxxxx'>
                - <Query of type: EventsTablesUnion, query_id: 'xxxxx'>
                   - <Query of type: EventTableSubset, query_id: 'xxxxx'>
                      - <Query of type: CustomQuery, query_id: 'xxxxx'>
                      - <Table: 'events.sms', query_id: 'xxxxx'>
                         - <Table: 'events.sms', query_id: 'xxxxx'>
                   - <Query of type: EventTableSubset, query_id: 'xxxxx'>
                      - <Query of type: CustomQuery, query_id: 'xxxxx'>
                      - <Table: 'events.calls', query_id: 'xxxxx'>
                         - <Table: 'events.calls', query_id: 'xxxxx'>
             - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
                - <Table: 'geography.admin3', query_id: 'xxxxx'>
        """)

    s = StringIO()
    print_dependency_tree(q, stream=s)
    output = s.getvalue()
    output_with_query_ids_replaced = re.sub(r"\b[0-9a-f]+\b", "xxxxx", output)

    assert expected_output == output_with_query_ids_replaced
Ejemplo n.º 20
0
def test_contact_reference_location_stats_custom_geometry(get_dataframe):
    """ Test ContactReferenceLocationStats with custom geometry column. """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    ml = ModalLocation(*[
        daily_location(
            d,
            spatial_unit=make_spatial_unit("versioned-cell"),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        ) for d in list_of_dates("2016-01-01", "2016-01-03")
    ])
    cb.store()
    ml.store()
    ml = CustomQuery(
        f"SELECT subscriber, ST_POINT(lon, lat) AS loc FROM ({ml.get_query()}) _",
        ["subscriber", "loc"],
    )
    query = ContactReferenceLocationStats(cb,
                                          ml,
                                          statistic="avg",
                                          geom_column="loc")
    df = get_dataframe(query).set_index("subscriber")
    assert df.value["gwAynWXp4eWvxGP7"] == pytest.approx(298.7215)
def test_partition_and_order_can_be_ommitted(
    partition_column, order_column, expected, get_dataframe
):
    """
    Test that the filter can be applied without partitioning or ordering.
    """
    test_query = CustomQuery(
        """SELECT * FROM (VALUES 
            (1, 1, 3), (1.1, 1, 2), (0.9, 1, 1), (1.1, 2, 3), 
            (0.95, 2, 2), (2.1, 2, 1), (1.95, 3, 3), (2.0, 3, 2),
             (2.05, 3, 1), (3.11, 4, 4), (2.99, 4, 3), (3.05, 4, 2), 
             (3.0, 4, 1)) as t(value, part, ord)""",
        column_names=["value", "part", "ord"],
    )
    smoothed = get_dataframe(
        IterativeMedianFilter(
            query_to_filter=test_query,
            filter_window_size=3,
            partition_column=partition_column,
            order_column=order_column,
        )
    )
    assert smoothed.value.tolist() == expected
Ejemplo n.º 22
0
def test_different_call_days_format(get_dataframe):
    """
    Test whether we can pass different call days format such as table name, SQL query and CallDays class.
    """
    cd = CallDays(
        SubscriberLocations("2016-01-01",
                            "2016-01-04",
                            spatial_unit=make_spatial_unit("versioned-site")))
    har = get_dataframe(HartiganCluster(calldays=cd, radius=50))
    assert isinstance(har, pd.DataFrame)

    cd.store().result()

    har = get_dataframe(
        HartiganCluster(calldays=Table(cd.fully_qualified_table_name),
                        radius=50))
    assert isinstance(har, pd.DataFrame)

    cd_query = cd.get_query()
    har = get_dataframe(
        HartiganCluster(calldays=CustomQuery(cd_query, cd.column_names),
                        radius=50))
    assert isinstance(har, pd.DataFrame)
Ejemplo n.º 23
0
def test_contact_reference_location_no_subscriber_raises():
    """ Test ValueError is raised for contact_location without subscriber. """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    cl = CustomQuery("SELECT 1 AS foo", ["foo"])
    with pytest.raises(ValueError):
        query = ContactReferenceLocationStats(cb, cl)
        ),
        (
            ("<SUBSCRIBER_ID_1>", "<SUBSCRIBER_ID_2>"),
            SubscriberSubsetterForExplicitSubset,
        ),
        (
            np.array(["<SUBSCRIBER_ID_1>", "<SUBSCRIBER_ID_2>"]),
            SubscriberSubsetterForExplicitSubset,
        ),
        (
            pd.Series(["<SUBSCRIBER_ID_1>", "<SUBSCRIBER_ID_2>"]),
            SubscriberSubsetterForExplicitSubset,
        ),
        (
            CustomQuery(
                "SELECT duration, msisdn as subscriber FROM events.calls WHERE duration < 200",
                ["duration", "subscriber"],
            ),
            SubscriberSubsetterForFlowmachineQuery,
        ),
    ],
)
def test_can_create_subscriber_subsetter_from_different_input_types(
        input_subset, expected_subsetter_type):
    """
    The factory function make_subscriber_subsetter() accepts supported input types and returns an appropriate instance of SubscriberSubsetterBase.
    """
    subsetter = make_subscriber_subsetter(input_subset)
    assert isinstance(subsetter, expected_subsetter_type)


def test_raises_error_if_flowmachine_query_does_not_contain_subscriber_column(