Example #1
def process_batch(request, data, errors):
    nickname = request.headers.get('X-Nickname', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, upload_items)

    if errors is SENTINEL:
        return HTTPServiceUnavailable()

    if errors:
        get_stats_client().incr('geosubmit.upload.errors', len(errors))

    result = HTTPOk()
    result.content_type = 'application/json'
    result.body = '{}'
    return result
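A note on the SENTINEL check above: it relies on identity ("is"), not equality, so an empty error list is never mistaken for a backend failure. A minimal sketch of that convention, assuming SENTINEL is a module-level marker object that process_upload returns when the backend is unavailable:

# Minimal sketch of the sentinel convention assumed above. SENTINEL is a
# unique marker object, so "errors is SENTINEL" cannot collide with an
# ordinary (possibly empty) list of errors.
SENTINEL = object()

def process_upload_stub(nickname, items):
    # Hypothetical stand-in for process_upload: return SENTINEL on
    # backend failure, otherwise a (possibly empty) list of errors.
    if items is None:
        return SENTINEL
    return []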
Example #2
def process_single(request):
    stats_client = get_stats_client()

    locate_data, locate_errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONParseError,
        accept_empty=True,
    )

    data, errors = preprocess_request(
        request,
        schema=GeoSubmitSchema(),
        extra_checks=(geosubmit_validator, ),
        response=None,
    )
    data = {'items': [data]}

    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)

    if errors is not SENTINEL and errors:  # pragma: no cover
        stats_client.incr('geosubmit.upload.errors', len(errors))

    first_item = data['items'][0]
    if first_item['latitude'] == -255 or first_item['longitude'] == -255:
        data = map_data(data['items'][0])
        session = request.db_slave_session
        result = search_all_sources(
            session,
            'geosubmit',
            data,
            client_addr=request.client_addr,
            geoip_db=request.registry.geoip_db,
            api_key_log=getattr(request, 'api_key_log', False),
            api_key_name=getattr(request, 'api_key_name', None))
    else:
        result = {
            'lat': first_item['latitude'],
            'lon': first_item['longitude'],
            'accuracy': first_item['accuracy']
        }

    if result is None:
        stats_client.incr('geosubmit.miss')
        result = HTTPNotFound()
        result.content_type = 'application/json'
        result.body = NOT_FOUND
        return result

    return {
        "location": {
            "lat": result['lat'],
            "lng": result['lon'],
        },
        "accuracy": float(result['accuracy']),
    }
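The -255 comparison above treats a missing position as a request to locate the device server-side. A hedged sketch of that convention, assuming the submit schema fills absent coordinates with a -255 default:

# Hedged sketch: the schema presumably defaults missing coordinates to
# -255, so this predicate separates client-supplied positions from ones
# that need a server-side lookup.
def has_client_position(item):
    return item['latitude'] != -255 and item['longitude'] != -255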
Example #3
def process_batch(request, data, errors):
    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)

    if errors is SENTINEL:  # pragma: no cover
        return HTTPServiceUnavailable()

    if errors:  # pragma: no cover
        get_stats_client().incr('geosubmit.upload.errors', len(errors))

    result = HTTPOk()
    result.content_type = 'application/json'
    result.body = '{}'
    return result
Example #4
def geoip_and_best_guess_country_codes(cell_keys, api_name, client_addr,
                                       geoip_db):
    """
    Return (geoip, alpha2) where geoip is the result of a GeoIP lookup
    and alpha2 is a best-guess ISO 3166 alpha2 country code. The country
    code guess uses both GeoIP and cell MCCs, preferring GeoIP. Return
    None for either field if no data is available.
    """

    stats_client = get_stats_client()
    geoip = None

    if client_addr and geoip_db is not None:
        geoip = geoip_db.geoip_lookup(client_addr)

    cell_countries = []
    cell_mccs = set()
    for cell_key in cell_keys:
        for c in mobile_codes.mcc(str(cell_key.mcc)):
            cell_countries.append(c.alpha2)
            cell_mccs.add(cell_key.mcc)

    if len(cell_mccs) > 1:
        stats_client.incr('%s.anomaly.multiple_mccs' % api_name)

    if geoip:
        # GeoIP always wins if we have it.
        accuracy, city = radius_from_geoip(geoip)
        if city:
            stats_client.incr('%s.geoip_city_found' % api_name)
        else:
            stats_client.incr('%s.geoip_country_found' % api_name)

        if geoip['country_code'] not in cell_countries:
            if cell_countries:
                stats_client.incr('%s.anomaly.geoip_mcc_mismatch' % api_name)
            # Only use the GeoIP country as an additional possible match,
            # but retain the cell countries as a likely match as well.
            cell_countries.append(geoip['country_code'])

        stats_client.incr('%s.country_from_geoip' % api_name)
        geoip_res = {
            'lat': geoip['latitude'],
            'lon': geoip['longitude'],
            'accuracy': accuracy
        }
        return (geoip_res, most_common_elements(cell_countries))

    else:
        stats_client.incr('%s.no_geoip_found' % api_name)

    # Pick the most-commonly-occurring country codes if we got any
    cc = most_common_elements(cell_countries)
    if cc:
        stats_client.incr('%s.country_from_mcc' % api_name)
        return (None, cc)

    stats_client.incr('%s.no_country' % api_name)
    return (None, [])
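The helper most_common_elements is not part of this example; a plausible sketch of it, inferred only from how it is used above (returning every country code tied for the highest count):

# Plausible sketch of most_common_elements, treated as an assumption.
from collections import Counter

def most_common_elements(elements):
    counts = Counter(elements)
    if not counts:
        return []
    top = max(counts.values())
    # Return every element tied for the highest count.
    return [value for value, count in counts.items() if count == top]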
Example #5
def process_single(request):
    stats_client = get_stats_client()

    locate_data, locate_errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONParseError,
        accept_empty=True,
    )

    data, errors = preprocess_request(
        request,
        schema=GeoSubmitSchema(),
        extra_checks=(geosubmit_validator,),
        response=None,
    )
    data = {'items': [data]}

    nickname = request.headers.get('X-Nickname', u'')
    email = request.headers.get('X-Email', u'')
    upload_items = flatten_items(data)
    errors = process_upload(nickname, email, upload_items)

    if errors is not SENTINEL and errors:  # pragma: no cover
        stats_client.incr('geosubmit.upload.errors', len(errors))

    first_item = data['items'][0]
    if first_item['latitude'] == -255 or first_item['longitude'] == -255:
        data = map_data(data['items'][0])
        session = request.db_slave_session
        result = search_all_sources(
            session, 'geosubmit', data,
            client_addr=request.client_addr,
            geoip_db=request.registry.geoip_db,
            api_key_log=getattr(request, 'api_key_log', False),
            api_key_name=getattr(request, 'api_key_name', None))
    else:
        result = {'lat': first_item['latitude'],
                  'lon': first_item['longitude'],
                  'accuracy': first_item['accuracy']}

    if result is None:
        stats_client.incr('geosubmit.miss')
        result = HTTPNotFound()
        result.content_type = 'application/json'
        result.body = NOT_FOUND
        return result

    return {
        "location": {
            "lat": result['lat'],
            "lng": result['lon'],
        },
        "accuracy": float(result['accuracy']),
    }
Example #6
def blacklist_and_remove_moving_stations(session, blacklist_model,
                                         station_type, to_key, join_key,
                                         moving_stations, remove_station):
    moving_keys = []
    utcnow = util.utcnow()
    for station in moving_stations:
        key = to_key(station)
        query = session.query(blacklist_model).filter(
            *join_key(blacklist_model, key))
        b = query.first()
        d = key._asdict()
        moving_keys.append(d)
        if b:
            b.time = utcnow
            b.count += 1
        else:
            b = blacklist_model(**d)
            session.add(b)

    if moving_keys:
        get_stats_client().incr("items.blacklisted.%s_moving" % station_type,
                                len(moving_keys))
        remove_station.delay(moving_keys)
Example #7
def blacklist_and_remove_moving_stations(session, blacklist_model,
                                         station_type, moving_stations,
                                         remove_station):
    moving_keys = []
    utcnow = util.utcnow()
    for station in moving_stations:
        station_key = blacklist_model.to_hashkey(station)
        query = session.query(blacklist_model).filter(
            *blacklist_model.joinkey(station_key))
        blacklisted_station = query.first()
        moving_keys.append(station_key)
        if blacklisted_station:
            blacklisted_station.time = utcnow
            blacklisted_station.count += 1
        else:
            blacklisted_station = blacklist_model(time=utcnow,
                                                  count=1,
                                                  **station_key.__dict__)
            session.add(blacklisted_station)

    if moving_keys:
        get_stats_client().incr("items.blacklisted.%s_moving" % station_type,
                                len(moving_keys))
        remove_station.delay(moving_keys)
Example #8
    def __init__(self, api_key_name, api_key_log, api_name):
        """
        A StatsLogger sends counted and timed named statistics to
        a statistic aggregator client.

        :param api_key_name: Human readable API key name
            (for example 'test_1')
        :type api_key_name: str
        :param api_key_log: Gather additional API key specific stats?
        :type api_key_log: bool
        :param api_name: Name of the API, used as stats prefix
            (for example 'geolocate')
        :type api_name: str
        """
        self.api_key_name = api_key_name
        self.api_key_log = api_key_log
        self.api_name = api_name
        self.raven_client = get_raven_client()
        self.stats_client = get_stats_client()
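A hedged usage sketch for this constructor, based only on the docstring above; the surrounding StatsLogger class definition is assumed:

# Hedged usage sketch; assumes the enclosing class is named StatsLogger.
logger = StatsLogger(
    api_key_name='test_1',  # human readable API key name
    api_key_log=True,       # gather additional per-key stats
    api_name='geolocate',   # used as the stats prefix
)
logger.stats_client.incr('%s.example_counter' % logger.api_name)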
Example #9
def process_station_measures(session,
                             entries,
                             station_type,
                             station_model,
                             measure_model,
                             blacklist_model,
                             create_measure,
                             create_key,
                             join_key,
                             userid=None,
                             max_measures_per_station=11000,
                             utcnow=None):

    all_measures = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()
    elif isinstance(utcnow, basestring):
        utcnow = decode_datetime(utcnow)

    # Process entries and group by validated station key
    station_measures = defaultdict(list)
    for entry in entries:
        measure = create_measure(utcnow, entry)

        if not measure:
            dropped_malformed += 1
            continue

        station_measures[create_key(measure)].append(measure)

    # Process measures one station at a time
    for key, measures in station_measures.items():

        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model, join_key,
                                       max_measures_per_station)
        if free is None:
            is_new_station = True
            free = max_measures_per_station

        if is_new_station:
            # Drop measures for blacklisted stations.
            if blacklisted_station(session, key, blacklist_model, join_key,
                                   utcnow):
                dropped_blacklisted += len(measures)
                continue

            incomplete = incomplete_measure(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept measures up to input-throttling limit, then drop.
        num = 0
        for measure in measures:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_measures.append(measure)
            free -= 1
            num += 1

        # Accept incomplete measures, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model, join_key,
                                     utcnow, num)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session, key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr("items.dropped.%s_ingress_blacklisted" %
                          station_type,
                          count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr("items.dropped.%s_ingress_malformed" % station_type,
                          count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr("items.dropped.%s_ingress_overflow" % station_type,
                          count=dropped_overflow)

    stats_client.incr("items.inserted.%s_measures" % station_type,
                      count=len(all_measures))

    session.add_all(all_measures)
    return all_measures
Example #10
    def stats_client(self):
        return get_stats_client()
Example #11
def search_all_sources(session,
                       api_name,
                       data,
                       client_addr=None,
                       geoip_db=None,
                       api_key_log=False,
                       api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
                        position or a country estimate.
    """

    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query all cells and OCID cells
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # create a list of 'and' criteria for cell keys
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # only do a query if we have cell results, or this will match
            # all rows in the table
            load_fields = ('radio', 'mcc', 'mnc', 'lac', 'lat', 'lon', 'range')
            query = (session.query(model).options(
                load_only(*load_fields)).filter(or_(*cell_filter)).filter(
                    model.lat.isnot(None)).filter(model.lon.isnot(None)))

            try:
                found_cells.extend(query.all())
            except Exception:
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found_cells by location area
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # use the lac with the most values,
            # or the one with the smallest range
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with the
        # most data points in it. That way a lac with a cell hit will
        # have two entries and win over a lac with only the lac entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key,
            # used only to distinguish cell from lac
            network = Network(key=None,
                              lat=cell.lat,
                              lon=cell.lon,
                              range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res,
     countries) = geoip_and_best_guess_country_codes(validated['cell'],
                                                     api_name, client_addr,
                                                     geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi)
    ]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field], stats_client,
                              api_name)
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' % (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' % (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' % (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']), lat,
                               lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' %
                              (api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2
            }
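A hedged sketch of calling this entry point, based only on the docstring above; the session, geoip_db and the payload shape are assumptions:

# Hedged call sketch; session and geoip_db are assumed to come from the
# request environment.
result = search_all_sources(
    session, 'geolocate',
    {'radio': 'gsm', 'cell': [], 'wifi': []},
    client_addr='127.0.0.1',
    geoip_db=geoip_db,
    result_type='position')  # or result_type='country'
# Returns a rounded lat/lon/accuracy dict, a country dict, or None on a
# miss.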
Example #12
def process_observations(observations,
                         session,
                         userid=None,
                         api_key_log=False,
                         api_key_name=None):
    stats_client = get_stats_client()
    positions = []
    cell_observations = []
    wifi_observations = []
    for i, obs in enumerate(observations):
        obs['report_id'] = uuid.uuid1()
        cell, wifi = process_observation(obs, session)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': obs['lat'],
                'lon': obs['lon'],
            })

    if cell_observations:
        # group by and create task per cell key
        stats_client.incr('items.uploaded.cell_observations',
                          len(cell_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.cell_observations' % api_key_name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend(observations)
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_cell.apply_async(args=[values],
                                             kwargs={'userid': userid},
                                             expires=21600,
                                             countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group by WiFi key
        stats_client.incr('items.uploaded.wifi_observations',
                          len(wifi_observations))
        if api_key_log:
            stats_client.incr(
                'items.api_log.%s.uploaded.wifi_observations' % api_key_name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_wifi.apply_async(args=[values],
                                             kwargs={'userid': userid},
                                             expires=21600,
                                             countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)
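The two grouping loops above walk a list in fixed-size slices and give each task a countdown one second larger than the last. A minimal standalone sketch of that batching pattern:

# Minimal sketch of the batching pattern used above.
def batched(seq, size):
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

for countdown, batch in enumerate(batched(list(range(12)), 5)):
    print(countdown, batch)  # 0 [0..4], then 1 [5..9], then 2 [10, 11]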
Example #13
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
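The zoom-in loop above only replaces the current estimate when the new candidate lies inside the current result's accuracy radius; the distance() helper is assumed to return kilometres, hence the * 1000 conversion to metres. A self-contained worked sketch:

import math

# Self-contained haversine stand-in for the distance() helper; treat its
# kilometre units as an assumption.
def distance_km(lat1, lon1, lat2, lon2):
    radius = 6371.0  # mean Earth radius in km
    d_lat = math.radians(lat2 - lat1)
    d_lon = math.radians(lon2 - lon1)
    a = (math.sin(d_lat / 2) ** 2 +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(d_lon / 2) ** 2)
    return 2 * radius * math.asin(math.sqrt(a))

result = {'lat': 51.5, 'lon': -0.1, 'accuracy': 35000.0}  # metres
candidate = {'lat': 51.6, 'lon': -0.1}
dist_m = distance_km(result['lat'], result['lon'],
                     candidate['lat'], candidate['lon']) * 1000
accept = dist_m <= result['accuracy']  # ~11100 <= 35000, so accepted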
Example #14
def process_station_observations(session,
                                 entries,
                                 station_type,
                                 station_model,
                                 observation_model,
                                 blacklist_model,
                                 userid=None,
                                 max_observations_per_station=11000,
                                 utcnow=None):

    all_observations = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()

    # Process entries and group by validated station key
    station_observations = defaultdict(list)
    for entry in entries:
        entry['created'] = utcnow
        obs = observation_model.create(entry)

        if not obs:
            dropped_malformed += 1
            continue

        station_observations[obs.hashkey()].append(obs)

    # Process observations one station at a time
    for key, observations in station_observations.items():

        first_blacklisted = None
        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       max_observations_per_station)
        if free is None:
            is_new_station = True
            free = max_observations_per_station

        if is_new_station:
            # Drop observations for blacklisted stations.
            blacklisted, first_blacklisted = blacklisted_station(
                session, key, blacklist_model, utcnow)
            if blacklisted:
                dropped_blacklisted += len(observations)
                continue

            incomplete = incomplete_observation(station_type, key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept observations up to input-throttling limit, then drop.
        num = 0
        for obs in observations:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_observations.append(obs)
            free -= 1
            num += 1

        # Accept incomplete observations, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model, utcnow, num,
                                     first_blacklisted)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session, key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr('items.dropped.%s_ingress_blacklisted' %
                          station_type,
                          count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr('items.dropped.%s_ingress_malformed' % station_type,
                          count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr('items.dropped.%s_ingress_overflow' % station_type,
                          count=dropped_overflow)

    stats_client.incr('items.inserted.%s_observations' % station_type,
                      count=len(all_observations))

    session.add_all(all_observations)
    return all_observations
Example #15
def search_all_sources(session,
                       api_name,
                       data,
                       client_addr=None,
                       geoip_db=None,
                       api_key_log=False,
                       api_key_name=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res,
     countries) = geoip_and_best_guess_country_codes(validated['cell'],
                                                     api_name, client_addr,
                                                     geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.

    for (data_field, object_field, metric_name, search_fn) in [
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi)
    ]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' % (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' % (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' % (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']), lat,
                               lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'
        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' %
                              (api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result
Example #16
def process_measures(items, session, userid=None):
    stats_client = get_stats_client()
    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item['report_id'] = uuid.uuid1().hex
        cell, wifi = process_measure(item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': item['lat'],
                'lon': item['lon'],
            })

    if cell_measures:
        # group by and create task per cell key
        stats_client.incr("items.uploaded.cell_measures",
                          len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for measures in cells[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_cell.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_measures:
        # group by WiFi key
        stats_client.incr("items.uploaded.wifi_measures",
                          len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few measures per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for measures in wifis[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_wifi.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)
Example #17
def process_station_measures(session, entries, station_type,
                             station_model, measure_model, blacklist_model,
                             create_measure, create_key, join_key,
                             userid=None, max_measures_per_station=11000,
                             utcnow=None):

    all_measures = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()
    elif isinstance(utcnow, basestring):
        utcnow = decode_datetime(utcnow)

    # Process entries and group by validated station key
    station_measures = defaultdict(list)
    for entry in entries:
        measure = create_measure(utcnow, entry)

        if not measure:
            dropped_malformed += 1
            continue

        station_measures[create_key(measure)].append(measure)

    # Process measures one station at a time
    for key, measures in station_measures.items():

        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       join_key, max_measures_per_station)
        if free is None:
            is_new_station = True
            free = max_measures_per_station

        if is_new_station:
            # Drop measures for blacklisted stations.
            if blacklisted_station(session, key, blacklist_model,
                                   join_key, utcnow):
                dropped_blacklisted += len(measures)
                continue

            incomplete = incomplete_measure(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept measures up to input-throttling limit, then drop.
        num = 0
        for measure in measures:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_measures.append(measure)
            free -= 1
            num += 1

        # Accept incomplete measures, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model,
                                     join_key, utcnow, num)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session,
                      key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_blacklisted" % station_type,
            count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_malformed" % station_type,
            count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_overflow" % station_type,
            count=dropped_overflow)

    stats_client.incr(
        "items.inserted.%s_measures" % station_type,
        count=len(all_measures))

    session.add_all(all_measures)
    return all_measures