def test_score(self):
     """
     Check station_score against fixed expected values, each compared
     after rounding to two decimals.
     """
     now = util.utcnow()

     def days_ago(count):
         return now - timedelta(days=count)

     # Each entry pairs the positional Dummy arguments with the
     # expected rounded score. The cases run in the listed order.
     cases = [
         ((now, now, 0, 1), 0.05),
         ((days_ago(1), now, 10, 2), 0.1),
         ((days_ago(5), now, 10, 2), 0.5),
         ((days_ago(10), now, 10, 2), 1.0),
         ((days_ago(10), now, 10, 64), 6.0),
         ((days_ago(10), now, 10, 1024), 10.0),
         ((days_ago(10), now, 0, 1024), 0.5),
         ((days_ago(70), days_ago(40), 10, 1024), 7.07),
         ((days_ago(190), days_ago(180), 10, 1024), 3.78),
         ((days_ago(190), days_ago(180), 10, 64), 2.27),
     ]
     for dummy_args, expected in cases:
         station = Dummy(*dummy_args)
         assert round(station_score(station, now), 2) == expected
Exemple #2
0
    def test_blue(self, geoip_db, http_session, session, source, stats):
        """
        Search with three Bluetooth networks and check that the best
        result is the expected region, scored as the sum of the station
        scores of the first two networks.
        """
        now = util.utcnow()
        expected_region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
        first = BlueShardFactory(samples=10)
        second = BlueShardFactory(samples=20)
        # Only built via .build(); its score is not part of the
        # expected total below.
        third = BlueShardFactory.build(region='DE', samples=100)
        session.flush()

        query = self.model_query(
            geoip_db, http_session, session, stats,
            blues=[first, second, third])
        results = source.search(query)
        self.check_model_results(results, [expected_region])

        best = results.best()
        assert best.region_code == expected_region.code
        expected_score = station_score(first, now) + station_score(second, now)
        assert best.score == expected_score

        expected_tags = [
            'key:test',
            'region:none',
            'source:internal',
            'accuracy:low',
            'status:hit',
        ]
        stats.check(counter=[(self.api_type + '.source', expected_tags)])
Exemple #3
0
    def test_blue(self, geoip_db, http_session, session, source, metricsmock):
        """
        Search with three Bluetooth networks and check that the best
        result is the expected region, scored as the sum of the station
        scores of the first two networks, and that one source metric
        with the expected tags was recorded.
        """
        now = util.utcnow()
        expected_region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
        first = BlueShardFactory(samples=10)
        second = BlueShardFactory(samples=20)
        # Only built via .build(); its score is not part of the
        # expected total below.
        third = BlueShardFactory.build(region="DE", samples=100)
        session.flush()

        blues = [first, second, third]
        query = self.model_query(geoip_db, http_session, session, blues=blues)
        results = source.search(query)
        self.check_model_results(results, [expected_region])

        best = results.best()
        assert best.region_code == expected_region.code
        expected_score = station_score(first, now) + station_score(second, now)
        assert best.score == expected_score

        expected_tags = [
            "key:test",
            "region:none",
            "source:internal",
            "accuracy:low",
            "status:hit",
        ]
        metric_name = self.api_type + ".source"
        assert metricsmock.has_record(
            "incr", metric_name, value=1, tags=expected_tags
        )
Exemple #4
0
    def test_multiple_cells(self, geoip_db, http_session, session, source):
        """
        Search with two cells that share radio/mcc/mnc/lac and check the
        combined position, accuracy and summed score of the best result.
        """
        now = util.utcnow()
        primary = CellShardFactory(samples=100)
        # Same area as the primary cell, next cid, offset by one degree
        # in both latitude and longitude.
        secondary = CellShardFactory(
            radio=primary.radio,
            mcc=primary.mcc,
            mnc=primary.mnc,
            lac=primary.lac,
            cid=primary.cid + 1,
            lat=primary.lat + 1.0,
            lon=primary.lon + 1.0,
            samples=10,
        )
        session.flush()

        cells = [primary, secondary]
        query = self.model_query(geoip_db, http_session, session, cells=cells)
        results = source.search(query)

        self.check_model_results(
            results,
            [primary],
            lat=primary.lat + 0.3333333,
            lon=primary.lon + 0.3333333,
            accuracy=CELL_MAX_ACCURACY,
        )
        expected_score = station_score(primary, now) + station_score(
            secondary, now)
        assert results.best().score == expected_score
Exemple #5
0
    def test_cluster_score_over_size(self, geoip_db, http_session, session, source):
        """
        Search with a three-network group and a separate two-network
        group and check that two results come back, with the best one
        located at the two-network group and scored as the sum of that
        group's station scores.
        """
        now = util.utcnow()

        def days_ago(count):
            return now - timedelta(days=count)

        yesterday = days_ago(1)
        last_week = days_ago(7)
        three_months = days_ago(90)
        four_months = days_ago(120)

        # Group one: three networks within 0.00003 degrees latitude.
        wifi11 = WifiShardFactory(
            samples=20, created=last_week, modified=yesterday
        )
        wifi12 = WifiShardFactory(
            lat=wifi11.lat + 0.00003,
            lon=wifi11.lon,
            samples=30,
            created=yesterday,
            modified=now,
        )
        wifi13 = WifiShardFactory(
            lat=wifi11.lat - 0.00003,
            lon=wifi11.lon,
            samples=10,
            created=yesterday,
            modified=now,
        )

        # Group two: two networks at the same spot, one degree away.
        wifi21 = WifiShardFactory(
            lat=wifi11.lat + 1.0,
            lon=wifi11.lon + 1.0,
            samples=40,
            created=four_months,
            modified=three_months,
        )
        wifi22 = WifiShardFactory(
            lat=wifi21.lat,
            lon=wifi21.lon,
            samples=50,
            created=three_months,
            modified=last_week,
        )
        session.flush()

        all_wifis = [wifi11, wifi12, wifi13, wifi21, wifi22]
        query = self.model_query(geoip_db, http_session, session, wifis=all_wifis)
        results = source.search(query)
        assert len(results) == 2

        best = results.best()
        assert round(best.lat, 7) == round(wifi21.lat, 7)
        assert round(best.lon, 7) == round(wifi21.lon, 7)
        assert round(best.accuracy, 2) == 10.0
        expected_score = station_score(wifi21, now) + station_score(wifi22, now)
        assert round(best.score, 2) == round(expected_score, 2)

        # The remaining, lower-scored result sits at the first group.
        lower_scored = [res for res in results if res.score < best.score]
        runner_up = lower_scored[0]
        assert round(runner_up.lat, 4) == round(wifi11.lat, 4)
        assert round(runner_up.lon, 4) == round(wifi11.lon, 4)
Exemple #6
0
    def test_top_results_in_noisy_cluster(
        self, geoip_db, http_session, session, source
    ):
        """
        Search with MAX_WIFIS_IN_CLUSTER + 10 closely spaced networks and
        check the position and score of the best result.
        """
        now = util.utcnow()
        # All networks should end up in one cluster: each one is offset
        # from the anchor in increments of (+0.1m, +0.12m).
        anchor = WifiShardFactory.build()
        wifis = [
            WifiShardFactory(
                lat=anchor.lat + step * 0.000001,
                lon=anchor.lon + step * 0.0000012,
                samples=100 - step,
            )
            for step in range(MAX_WIFIS_IN_CLUSTER + 10)
        ]
        session.flush()

        # The expected score is the sum over every network.
        expected_score = sum(station_score(wifi, now) for wifi in wifis)

        query = self.model_query(geoip_db, http_session, session, wifis=wifis)
        # Give each lookup a progressively weaker signal strength.
        for position, lookup in enumerate(query.wifi):
            lookup.signalStrength = -50 - position

        best = source.search(query).best()
        assert round(best.lat, 4) == round(anchor.lat, 4)
        assert round(best.lon, 4) == round(anchor.lon, 4)
        assert round(best.score, 4) == round(expected_score, 4)
Exemple #7
0
def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    Each cell is grouped by its ``area_id`` and every group is turned
    into one numpy array of dtype ``NETWORK_DTYPE``. The array rows hold
    the cell's lat, lon and radius, the age and signal strength of the
    matching lookup, the cell's station score, its encoded cell id and
    a flag telling whether the cell was last seen today or later.

    ``min_age`` is used in place of a missing or zero lookup age.

    Returns a list with one array per area. Raises ``KeyError`` if a
    cell has no lookup with the same (decoded) cell id.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of cell ids mapped to their age and signal strength.
    # The age is clamped to be at least 1000 and a missing signal
    # strength falls back to the minimum for the lookup's radio type.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_cellid(lookup.cellid)] = (
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType])

    areas = defaultdict(list)
    for cell in cells:
        areas[area_id(cell)].append(cell)

    clusters = []
    for area_cells in areas.values():
        clusters.append(
            numpy.array(
                [(cell.lat, cell.lon, cell.radius, obs_data[cell.cellid][0],
                  obs_data[cell.cellid][1], station_score(cell, now),
                  encode_cellid(*cell.cellid),
                  # A cell without a last_seen date cannot be compared
                  # against a date; treat it as not seen today.
                  bool(cell.last_seen is not None and
                       cell.last_seen >= today))
                 for cell in area_cells],
                dtype=NETWORK_DTYPE))

    return clusters
 def test_block_last(self):
     """
     Check the rounded station score of a Dummy station that is given
     an extra date as its fifth positional argument.
     """
     now = util.utcnow()
     station = Dummy(
         now - timedelta(days=70),
         now - timedelta(days=60),
         10,
         64,
         (now - timedelta(days=65)).date())
     score = station_score(station, now)
     assert round(score, 2) == 1.73
Exemple #9
0
def cluster_cells(cells, lookups, min_age=0):
    """
    Cluster cells by area.

    Each cell is grouped by its ``area_id`` and every group is turned
    into one numpy array of dtype ``NETWORK_DTYPE``. The array rows hold
    the cell's lat, lon and radius, the age and signal strength of the
    matching lookup, the cell's station score, its encoded cell id and
    a flag telling whether the cell was last seen today or later.

    ``min_age`` is used in place of a missing or zero lookup age.

    Returns a list with one array per area. Raises ``KeyError`` if a
    cell has no lookup with the same (decoded) cell id.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of cell ids mapped to their age and signal strength.
    # The age is clamped to be at least 1000 and a missing signal
    # strength falls back to the minimum for the lookup's radio type.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_cellid(lookup.cellid)] = (
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or MIN_CELL_SIGNAL[lookup.radioType])

    areas = defaultdict(list)
    for cell in cells:
        areas[area_id(cell)].append(cell)

    clusters = []
    for area_cells in areas.values():
        clusters.append(numpy.array([(
            cell.lat, cell.lon, cell.radius,
            obs_data[cell.cellid][0],
            obs_data[cell.cellid][1],
            station_score(cell, now),
            encode_cellid(*cell.cellid),
            # A cell without a last_seen date cannot be compared
            # against a date; treat it as not seen today.
            bool(cell.last_seen is not None and cell.last_seen >= today))
            for cell in area_cells],
            dtype=NETWORK_DTYPE))

    return clusters
 def test_last_seen(self):
     """
     Check the rounded station score of a Dummy station that is given
     two extra dates as its fifth and sixth positional arguments.
     """
     now = util.utcnow()
     station = Dummy(
         now - timedelta(days=70),
         now - timedelta(days=60),
         10,
         64,
         (now - timedelta(days=65)).date(),
         (now - timedelta(days=58)).date())
     score = station_score(station, now)
     assert round(score, 2) == 2.42
Exemple #11
0
    def test_cell(self, geoip_db, http_session, session, source):
        """
        Search with a single cell and check that the best result carries
        exactly that cell's station score.
        """
        now = util.utcnow()
        known_cell = CellShardFactory(samples=10)
        session.flush()

        query = self.model_query(
            geoip_db, http_session, session, cells=[known_cell])
        results = source.search(query)
        self.check_model_results(results, [known_cell])

        expected_score = station_score(known_cell, now)
        assert results.best().score == expected_score
Exemple #12
0
    def test_cell(self, geoip_db, http_session, session, source, stats):
        """
        Search with a single cell and check that the best result carries
        exactly that cell's station score.
        """
        now = util.utcnow()
        known_cell = CellShardFactory(samples=10)
        session.flush()

        query = self.model_query(geoip_db, http_session, session, stats,
                                 cells=[known_cell])
        results = source.search(query)
        self.check_model_results(results, [known_cell])

        expected_score = station_score(known_cell, now)
        assert results.best().score == expected_score
Exemple #13
0
    def test_multiple_cells(self, geoip_db, http_session,
                            session, source, stats):
        """
        Search with two cells that share radio/mcc/mnc/lac and check the
        combined position, accuracy and summed score of the best result.
        """
        now = util.utcnow()
        primary = CellShardFactory(samples=100)
        # Same area as the primary cell, next cid, offset by one degree
        # in both latitude and longitude.
        secondary = CellShardFactory(
            radio=primary.radio,
            mcc=primary.mcc,
            mnc=primary.mnc,
            lac=primary.lac,
            cid=primary.cid + 1,
            lat=primary.lat + 1.0,
            lon=primary.lon + 1.0,
            samples=10)
        session.flush()

        query = self.model_query(geoip_db, http_session, session, stats,
                                 cells=[primary, secondary])
        results = source.search(query)

        self.check_model_results(results, [primary],
                                 lat=primary.lat + 0.3333333,
                                 lon=primary.lon + 0.3333333,
                                 accuracy=CELL_MAX_ACCURACY)
        expected_score = (station_score(primary, now) +
                          station_score(secondary, now))
        assert results.best().score == expected_score
Exemple #14
0
    def test_wifi(self, geoip_db, http_session, session, source, metricsmock):
        """
        Search with three WiFi networks and check that the best result
        is the expected region, scored as the sum of the station scores
        of the first two networks, and that the source metric was
        incremented once with the expected tags.
        """
        now = util.utcnow()
        expected_region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
        first = WifiShardFactory(samples=10)
        second = WifiShardFactory(samples=20)
        # Only built via .build(); its score is not part of the
        # expected total below.
        third = WifiShardFactory.build(region="DE", samples=100)
        session.flush()

        wifis = [first, second, third]
        query = self.model_query(geoip_db, http_session, session, wifis=wifis)
        results = source.search(query)
        self.check_model_results(results, [expected_region])

        best = results.best()
        assert best.region_code == expected_region.code
        expected_score = station_score(first, now) + station_score(second, now)
        assert best.score == expected_score

        expected_tags = [
            "key:test",
            "source:internal",
            "accuracy:low",
            "status:hit",
        ]
        metricsmock.assert_incr_once(
            self.api_type + ".source", tags=expected_tags)
Exemple #15
0
    def test_blue(self, geoip_db, http_session,
                  session, source, stats):
        """
        Search with three Bluetooth networks and check that the best
        result is the expected region, scored as the sum of the station
        scores of the first two networks.
        """
        now = util.utcnow()
        expected_region = GEOCODER.regions_for_mcc(235, metadata=True)[0]
        first = BlueShardFactory(samples=10)
        second = BlueShardFactory(samples=20)
        # Only built via .build(); its score is not part of the
        # expected total below.
        third = BlueShardFactory.build(region='DE', samples=100)
        session.flush()

        blues = [first, second, third]
        query = self.model_query(geoip_db, http_session, session, stats,
                                 blues=blues)
        results = source.search(query)
        self.check_model_results(results, [expected_region])

        best = results.best()
        assert best.region_code == expected_region.code
        expected_score = (station_score(first, now) +
                          station_score(second, now))
        assert best.score == expected_score

        counter_name = self.api_type + '.source'
        counter_tags = ['key:test', 'region:none', 'source:internal',
                        'accuracy:low', 'status:hit']
        stats.check(counter=[(counter_name, counter_tags)])
Exemple #16
0
    def search_wifi(self, query):
        """
        Build region results from the WiFi networks matching the query.

        The station scores of all matched networks are summed per region
        code; every code the geocoder can resolve becomes one result.
        """
        results = self.result_list()

        now = util.utcnow()
        score_by_code = {}
        matches = query_macs(query, query.wifi, self.raven_client, WifiShard)
        for network in matches:
            code = network.region
            score_by_code[code] = (
                score_by_code.get(code, 0) + station_score(network, now))

        for code, total in score_by_code.items():
            region = GEOCODER.region_for_code(code)
            if not region:
                # Codes the geocoder does not know produce no result.
                continue
            result = self.result_type(
                region_code=code,
                region_name=region.name,
                accuracy=region.radius,
                score=total)
            results.add(result)

        return results
Exemple #17
0
    def search_blue(self, query):
        """
        Build region results from the Bluetooth networks matching the
        query.

        The station scores of all matched networks are summed per region
        code; every code the geocoder can resolve becomes one result.
        """
        results = self.result_list()

        now = util.utcnow()
        score_by_code = {}
        matches = query_macs(query, query.blue, self.raven_client, BlueShard)
        for network in matches:
            code = network.region
            score_by_code[code] = (
                score_by_code.get(code, 0) + station_score(network, now))

        for code, total in score_by_code.items():
            region = GEOCODER.region_for_code(code)
            if not region:
                # Codes the geocoder does not know produce no result.
                continue
            result = self.result_type(
                region_code=code,
                region_name=region.name,
                accuracy=region.radius,
                score=total)
            results.add(result)

        return results
Exemple #18
0
    def search_wifi(self, query):
        """
        Return one region result per resolvable region code among the
        WiFi networks matching the query, scored by the sum of the
        station scores of that region's networks.
        """
        results = self.result_list()

        now = util.utcnow()
        totals = {}
        for found in query_macs(
                query, query.wifi, self.raven_client, WifiShard):
            region_code = found.region
            totals[region_code] = (
                totals.get(region_code, 0) + station_score(found, now))

        for region_code, total in totals.items():
            region = GEOCODER.region_for_code(region_code)
            if not region:
                # Skip codes the geocoder cannot resolve.
                continue
            results.add(self.result_type(
                region_code=region_code,
                region_name=region.name,
                accuracy=region.radius,
                score=total))

        return results
Exemple #19
0
    def search_blue(self, query):
        """
        Return one region result per resolvable region code among the
        Bluetooth networks matching the query, scored by the sum of the
        station scores of that region's networks.
        """
        results = self.result_list()

        now = util.utcnow()
        totals = {}
        for found in query_macs(
                query, query.blue, self.raven_client, BlueShard):
            region_code = found.region
            totals[region_code] = (
                totals.get(region_code, 0) + station_score(found, now))

        for region_code, total in totals.items():
            region = GEOCODER.region_for_code(region_code)
            if not region:
                # Skip codes the geocoder cannot resolve.
                continue
            results.add(self.result_type(
                region_code=region_code,
                region_name=region.name,
                accuracy=region.radius,
                score=total))

        return results
Exemple #20
0
def cluster_networks(models, lookups,
                     min_age=0, min_radius=None, min_signal=None,
                     max_distance=None):
    """
    Given a list of database models and lookups, return
    a list of clusters of nearby networks.

    Each cluster is a numpy array of dtype ``NETWORK_DTYPE`` with one
    row per network. ``min_age``, ``min_radius`` and ``min_signal``
    replace missing or zero lookup ages, model radii and lookup signal
    strengths. ``max_distance`` is the distance threshold used to
    decide which networks belong together.

    Only clusters with at least two networks are returned. Raises
    ``KeyError`` if a model has no lookup with the same (decoded) mac.
    """
    now = util.utcnow()
    today = now.date()

    # Create a dict of macs mapped to their age and signal strength.
    # The age is clamped to be at least 1000.
    obs_data = {}
    for lookup in lookups:
        obs_data[decode_mac(lookup.mac)] = (
            max(abs(lookup.age or min_age), 1000),
            lookup.signalStrength or min_signal)

    networks = numpy.array([(
        model.lat, model.lon,
        model.radius or min_radius,
        obs_data[model.mac][0],
        obs_data[model.mac][1],
        station_score(model, now),
        encode_mac(model.mac),
        # A model without a last_seen date cannot be compared against
        # a date; treat it as not seen today.
        bool(model.last_seen is not None and model.last_seen >= today))
        for model in models],
        dtype=NETWORK_DTYPE)

    # Only consider clusters that have at least 2 found networks
    # inside them. Otherwise someone could use a combination of
    # one real network and one fake and therefore not found network to
    # get the position of the real network.
    length = len(networks)
    if length < 2:
        # Not enough networks to form a valid cluster.
        return []

    positions = networks[['lat', 'lon']]
    if length == 2:
        one = positions[0]
        two = positions[1]
        if distance(one[0], one[1],
                    two[0], two[1]) <= max_distance:
            # Only two networks and they agree, so cluster them.
            return [networks]
        else:
            # Or they disagree forming two clusters of size one,
            # neither of which is large enough to be returned.
            return []

    # Calculate the condensed distance matrix based on distance in meters.
    # This avoids calculating the square form, which would calculate
    # each value twice and avoids calculating the diagonal of zeros.
    # We avoid the special cases for length < 2 with the above checks.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    dist_matrix = numpy.zeros(length * (length - 1) // 2, dtype=numpy.double)
    for i, (a, b) in enumerate(itertools.combinations(positions, 2)):
        dist_matrix[i] = distance(a[0], a[1], b[0], b[1])

    link_matrix = hierarchy.linkage(dist_matrix, method='complete')
    assignments = hierarchy.fcluster(
        link_matrix, max_distance, criterion='distance', depth=2)

    # Group the networks by the cluster id assigned to each of them.
    indexed_clusters = defaultdict(list)
    for i, net in zip(assignments, networks):
        indexed_clusters[i].append(net)

    clusters = []
    for values in indexed_clusters.values():
        if len(values) >= 2:
            clusters.append(numpy.array(values, dtype=NETWORK_DTYPE))

    return clusters
Exemple #21
0
def cluster_networks(models,
                     lookups,
                     min_age=0,
                     min_radius=None,
                     min_signal=None,
                     max_distance=None):
    """
    Group the given database models into clusters of nearby networks.

    Each returned cluster is a numpy array of dtype ``NETWORK_DTYPE``
    with one row per network. Clusters with fewer than two networks
    are never returned.
    """
    now = util.utcnow()
    today = now.date()

    # Map every decoded mac to the (age, signal strength) pair of its
    # lookup. The age is clamped to be at least 1000.
    obs_data = {}
    for lookup in lookups:
        age = max(abs(lookup.age or min_age), 1000)
        signal = lookup.signalStrength or min_signal
        obs_data[decode_mac(lookup.mac)] = (age, signal)

    rows = []
    for model in models:
        age, signal = obs_data[model.mac]
        seen_today = model.last_seen is not None and model.last_seen >= today
        rows.append((
            model.lat,
            model.lon,
            model.radius or min_radius,
            age,
            signal,
            station_score(model, now),
            encode_mac(model.mac, codec="base64"),
            bool(seen_today),
        ))
    networks = numpy.array(rows, dtype=NETWORK_DTYPE)

    # A cluster needs at least 2 found networks. Otherwise someone
    # could combine one real network with one fake (and thus not
    # found) network to learn the position of the real one.
    count = len(networks)
    if count < 2:
        return []

    positions = networks[["lat", "lon"]]
    if count == 2:
        first, second = positions[0], positions[1]
        gap = distance(first[0], first[1], second[0], second[1])
        # Two networks either agree and form the single cluster, or
        # disagree and leave two clusters of size one, which are too
        # small to be returned.
        return [networks] if gap <= max_distance else []

    # Build the condensed distance matrix (distance in meters): one
    # entry per unordered pair, skipping the square form's duplicate
    # values and its diagonal of zeros. The checks above rule out the
    # special cases for fewer than two networks.
    # See scipy.spatial.distance.squareform and
    # https://stackoverflow.com/questions/13079563
    pairs = itertools.combinations(positions, 2)
    dist_matrix = numpy.array(
        [distance(a[0], a[1], b[0], b[1]) for a, b in pairs],
        dtype=numpy.double,
    )

    link_matrix = hierarchy.linkage(dist_matrix, method="complete")
    assignments = hierarchy.fcluster(
        link_matrix, max_distance, criterion="distance", depth=2
    )

    # Collect the networks under the cluster id assigned to each.
    members_by_id = {}
    for cluster_id, net in zip(assignments, networks):
        members_by_id.setdefault(cluster_id, []).append(net)

    return [
        numpy.array(members, dtype=NETWORK_DTYPE)
        for members in members_by_id.values()
        if len(members) >= 2
    ]