def closure(request, *args, **kwargs):
    """
    Check the ``key`` query parameter and then call the wrapped ``func``.

    ``func``, ``func_name`` and ``error_on_invalidkey`` are free names
    which are not defined in this block (presumably they come from an
    enclosing decorator -- confirm against the surrounding code).

    The request always carries the ``api_key_log`` and ``api_key_name``
    attributes by the time ``func`` is called: they default to
    ``False`` / ``None`` and are only replaced by the stored values when
    the key is found in the database.

    :param request: The incoming request. This block reads ``GET``,
                    ``registry`` and ``db_slave_session`` from it.
    :returns: The result of ``func(request, *args, **kwargs)``, the
              result of ``invalid_api_key_response()`` for a missing or
              unknown key when ``error_on_invalidkey`` is true, or a
              ``HTTPForbidden`` response with the ``DAILY_LIMIT`` body
              when ``rate_limit`` reports the key as over its limit.
    """
    api_key = request.GET.get('key', None)
    heka_client = get_heka_client()
    stats_client = request.registry.stats_client

    # Provide the api log/name attributes on every code path. Without
    # these defaults the database-failure path below would call the
    # wrapped function with a request lacking both attributes.
    request.api_key_log = False
    request.api_key_name = None

    if api_key is None:
        stats_client.incr('%s.no_api_key' % func_name)
        if error_on_invalidkey:
            return invalid_api_key_response()

    session = request.db_slave_session
    try:
        result = session.execute(API_CHECK.bindparams(api_key=api_key))
        found_key = result.fetchone()
    except Exception:  # pragma: no cover
        # if we cannot connect to backend DB, skip api key check
        heka_client.raven(RAVEN_ERROR)
        stats_client.incr('%s.dbfailure_skip_api_key' % func_name)
        return func(request, *args, **kwargs)

    if found_key is None:
        stats_client.incr('%s.unknown_api_key' % func_name)
        if error_on_invalidkey:
            return invalid_api_key_response()
        # the default api log/name attributes set above stay in place
        return func(request, *args, **kwargs)

    maxreq, api_key_log, shortname = found_key
    if not shortname:  # pragma: no cover
        # fall back to the raw key when no short name is stored
        shortname = api_key

    # remember api key and shortname on the request
    request.api_key_log = bool(api_key_log)
    request.api_key_name = shortname

    stats_client.incr('%s.api_key.%s' % (func_name, shortname))
    should_limit = rate_limit(request.registry.redis_client,
                              api_key, maxreq=maxreq)
    if should_limit:
        result = HTTPForbidden()
        result.content_type = 'application/json'
        result.body = DAILY_LIMIT
        return result
    elif should_limit is None:  # pragma: no cover
        # We couldn't connect to Redis
        stats_client.incr('%s.redisfailure_skip_limit' % func_name)

    return func(request, *args, **kwargs)
def __init__(self, api_key_name, api_key_log, api_name):
    """
    Set up a StatsLogger, which sends counted and timed named
    statistics to a statistic aggregator client.

    The heka and stats clients are looked up once, at construction
    time, and stored on the instance.

    :param api_key_name: Human readable API key name
                         (for example 'test_1')
    :type api_key_name: str

    :param api_key_log: Gather additional API key specific stats?
    :type api_key_log: bool

    :param api_name: Name of the API, used as stats prefix
                     (for example 'geolocate')
    :type api_name: str
    """
    # plain values describing the API and the key in use
    self.api_name = api_name
    self.api_key_log = api_key_log
    self.api_key_name = api_key_name

    # reporting clients
    self.heka_client = get_heka_client()
    self.stats_client = get_stats_client()
def closure(request, *args, **kwargs):
    """
    Check the ``key`` query parameter and then call the wrapped ``func``.

    ``func``, ``func_name`` and ``error_on_invalidkey`` are free names
    which are not defined in this block (presumably they come from an
    enclosing decorator -- confirm against the surrounding code).

    :param request: The incoming request. This block reads ``GET``,
                    ``registry`` and ``db_slave_session`` from it.
    :returns: The result of ``func(request, *args, **kwargs)``, the
              result of ``invalid_api_key_response()`` for a missing or
              unknown key when ``error_on_invalidkey`` is true, or a
              ``HTTPForbidden`` response with the ``DAILY_LIMIT`` body
              when ``rate_limit`` reports the key as over its limit.
    """
    api_key = request.GET.get('key', None)
    heka_client = get_heka_client()
    stats_client = request.registry.stats_client

    if api_key is None:
        stats_client.incr('%s.no_api_key' % func_name)
        if error_on_invalidkey:
            return invalid_api_key_response()

    db_session = request.db_slave_session
    try:
        statement = API_CHECK.bindparams(api_key=api_key)
        found_key = db_session.execute(statement).fetchone()
    except Exception:
        # The key cannot be verified without the backend DB, so the
        # check is skipped and the request is let through.
        heka_client.raven(RAVEN_ERROR)
        stats_client.incr('%s.dbfailure_skip_api_key' % func_name)
        return func(request, *args, **kwargs)

    if found_key is None:
        stats_client.incr('%s.unknown_api_key' % func_name)
        if error_on_invalidkey:
            return invalid_api_key_response()
        return func(request, *args, **kwargs)

    maxreq, shortname = found_key
    # use the raw key in the metric name when no short name is stored
    metric_key_name = shortname if shortname else api_key
    stats_client.incr('%s.api_key.%s' % (func_name, metric_key_name))

    should_limit = rate_limit(request.registry.redis_client,
                              api_key, maxreq=maxreq)
    if should_limit:
        response = HTTPForbidden()
        response.content_type = 'application/json'
        response.body = DAILY_LIMIT
        return response

    if should_limit is None:
        # Redis could not be reached, the limit is not enforced
        stats_client.incr('%s.redisfailure_skip_limit' % func_name)

    return func(request, *args, **kwargs)
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None,
                       result_type='position'):
    """
    Shared lookup path for all lookup APIs, combining WiFi, cell,
    cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
                        position or a country estimate.
    :returns: ``None`` if no data source produced a result. Otherwise,
              for ``result_type='position'``, a dict with rounded
              ``lat``, ``lon`` and ``accuracy`` values; for
              ``result_type='country'``, a dict with ``country_name``
              and ``country_code``, or ``None`` if there are no
              candidate country codes.
    :raises ValueError: If ``result_type`` is neither ``'position'``
                        nor ``'country'``.
    """
    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    best = None
    best_metric = None

    # Pre-process wifi data, dropping entries rejected by normalization
    normalized_wifis = (normalized_wifi_dict(entry)
                        for entry in data.get('wifi', ()))
    validated = {
        'wifi': [entry for entry in normalized_wifis if entry],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process cell data
    default_radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for raw_cell in data.get('cell', ()):
        valid_cell = normalized_cell_dict(raw_cell,
                                          default_radio=default_radio)
        if not valid_cell:
            continue
        cell_key = to_cellkey(valid_cell)
        validated['cell'].append(cell_key)
        validated['cell_lac'].add(cell_key)

    # Query cells, OCID cells and cell areas
    load_fields = ('radio', 'mcc', 'mnc', 'lac', 'lat', 'lon', 'range')
    found_cells = []
    for model in (Cell, OCIDCell, CellArea):
        # one 'and' criterion per cell key
        key_criteria = [and_(*join_cellkey(model, key))
                        for key in validated['cell']]
        if not key_criteria:
            # without any criteria the query would match all rows
            # in the table
            continue

        query = session.query(model)
        query = query.options(load_only(*load_fields))
        query = query.filter(or_(*key_criteria))
        query = query.filter(model.lat.isnot(None))
        query = query.filter(model.lon.isnot(None))

        try:
            found_cells.extend(query.all())
        except Exception:
            heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found cells by location area
        by_area = defaultdict(list)
        for found in found_cells:
            area_key = (found.radio, found.mcc, found.mnc, found.lac)
            by_area[area_key].append(found)

        def area_rank(members):
            # prefer the area with the most entries,
            # then the one with the smallest range
            return (len(members),
                    -min(member.range for member in members))

        # If we get data from multiple location areas, use the one with
        # the most data points in it. That way an area with a cell hit
        # has two entries and wins over an area with only the area entry.
        ranked_areas = sorted(by_area.values(), key=area_rank, reverse=True)

        for found in ranked_areas[0]:
            network = Network(key=None, lat=found.lat,
                              lon=found.lon, range=found.range)
            if type(found) is CellArea:
                bucket = 'cell_lac_network'
            else:
                bucket = 'cell_network'
            validated[bucket].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    geoip_res, countries = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate.
    zoom_steps = (
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi),
    )
    for data_field, object_field, metric_name, search_fn in zoom_steps:
        if not validated[data_field]:
            continue

        found = None
        try:
            found = search_fn(session, validated[object_field],
                              stats_client, api_name)
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if found is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(found['lat'])
        lon = float(found['lon'])
        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Record a metric when the hit matches none of the possible
        # countries. The hit itself is still considered below.
        if countries and not any(
                location_is_in_country(lat, lon, code, 1)
                for code in countries):
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        # Always accept the first result we get.
        if best is None:
            best = found
            best_metric = metric_name
            continue

        # Or any result that appears to be an improvement over the
        # existing best guess.
        offset = distance(float(best['lat']),
                          float(best['lon']), lat, lon) * 1000
        if offset <= best['accuracy']:
            best = found
            best_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, best_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not best and geoip_res:
        best = geoip_res
        best_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        wifi_keys = set(wifi['key'] for wifi in validated['wifi'])
        if (wifi_keys and
                len(filter_bssids_by_similarity(wifi_keys)) >=
                MIN_WIFIS_IN_QUERY):
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if best_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            cell_outcomes = {'cell': 'cell_hit', 'cell_lac': 'cell_lac_hit'}
            api_log_metric = cell_outcomes.get(best_metric, 'cell_miss')
        elif geoip_res:
            api_log_metric = 'geoip_hit'
        else:
            api_log_metric = 'geoip_miss'

        stats_client.incr('%s.api_log.%s.%s' %
                          (api_name, api_key_name, api_log_metric))

    if not best:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, best_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(best['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(best['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(best['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, best_metric),
                            rounded_result['accuracy'])
        return rounded_result

    if result_type == 'country' and countries:
        # the first candidate country code is the one reported
        country = iso3166.countries.get(countries[0])
        return {
            'country_name': country.name,
            'country_code': country.alpha2,
        }

    return None
def heka_client(self):
    """
    Return the client provided by ``get_heka_client()``.

    The client is looked up again on every call; nothing is stored
    on the instance.
    """
    client = get_heka_client()
    return client
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Shared lookup path for all lookup APIs, combining WiFi, cell,
    cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :returns: ``None`` if no data source produced a result, otherwise a
              dict with rounded ``lat``, ``lon`` and ``accuracy`` values.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    best = None
    best_metric = None

    validated = {
        # wifi data is passed through unchanged
        'wifi': data.get('wifi', []),
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process cell data
    default_radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for raw_cell in data.get('cell', ()):
        valid_cell = normalized_cell_dict(raw_cell,
                                          default_radio=default_radio)
        if not valid_cell:
            continue
        cell_key = to_cellkey(valid_cell)
        validated['cell'].append(cell_key)
        # the lac key is the cell key with its cid set to CELLID_LAC
        validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = list(validated['cell']) + list(validated['cell_lac'])

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []

    for network in all_networks:
        if network.key == CELLID_LAC:
            bucket = 'cell_lac_network'
        else:
            bucket = 'cell_network'
        validated[bucket].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    geoip_res, countries = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate.
    zoom_steps = (
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi),
    )
    for data_field, object_field, metric_name, search_fn in zoom_steps:
        if not validated[data_field]:
            continue

        found = None
        try:
            found = search_fn(session, validated[object_field])
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if found is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(found['lat'])
        lon = float(found['lon'])
        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Record a metric when the hit matches none of the possible
        # countries. The hit itself is still considered below.
        if countries and not any(
                location_is_in_country(lat, lon, code, 1)
                for code in countries):
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        # Always accept the first result we get.
        if best is None:
            best = found
            best_metric = metric_name
            continue

        # Or any result that appears to be an improvement over the
        # existing best guess.
        offset = distance(float(best['lat']),
                          float(best['lon']), lat, lon) * 1000
        if offset <= best['accuracy']:
            best = found
            best_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, best_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not best and geoip_res:
        best = geoip_res
        best_metric = 'geoip'

    if not best:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(best['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(best['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(best['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, best_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, best_metric),
                        rounded_result['accuracy'])

    return rounded_result
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None):
    """
    Shared lookup path for all lookup APIs, combining WiFi, cell,
    cell-lac and GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: If true, and ``api_key_name`` is set, an extra
                        per-key ``api_log`` metric is recorded.
    :param api_key_name: The api key name used in the ``api_log`` metric.
    :returns: ``None`` if no data source produced a result, otherwise a
              dict with rounded ``lat``, ``lon`` and ``accuracy`` values.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()

    best = None
    best_metric = None

    # Pre-process wifi data, dropping entries rejected by normalization
    normalized_wifis = (normalized_wifi_dict(entry)
                        for entry in data.get('wifi', ()))
    validated = {
        'wifi': [entry for entry in normalized_wifis if entry],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process cell data
    default_radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for raw_cell in data.get('cell', ()):
        valid_cell = normalized_cell_dict(raw_cell,
                                          default_radio=default_radio)
        if not valid_cell:
            continue
        cell_key = to_cellkey(valid_cell)
        validated['cell'].append(cell_key)
        # the lac key is the cell key with its cid set to CELLID_LAC
        validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = list(validated['cell']) + list(validated['cell_lac'])

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []

    for network in all_networks:
        if network.key == CELLID_LAC:
            bucket = 'cell_lac_network'
        else:
            bucket = 'cell_network'
        validated[bucket].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    geoip_res, countries = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate.
    zoom_steps = (
        ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
        ('cell', 'cell_network', 'cell', search_cell),
        ('wifi', 'wifi', 'wifi', search_wifi),
    )
    for data_field, object_field, metric_name, search_fn in zoom_steps:
        if not validated[data_field]:
            continue

        found = None
        try:
            found = search_fn(session, validated[object_field])
        except Exception:
            heka_client.raven(RAVEN_ERROR)
            stats_client.incr('%s.%s_error' % (api_name, metric_name))

        if found is None:
            stats_client.incr('%s.no_%s_found' % (api_name, metric_name))
            continue

        lat = float(found['lat'])
        lon = float(found['lon'])
        stats_client.incr('%s.%s_found' % (api_name, metric_name))

        # Record a metric when the hit matches none of the possible
        # countries. The hit itself is still considered below.
        if countries and not any(
                location_is_in_country(lat, lon, code, 1)
                for code in countries):
            stats_client.incr('%s.anomaly.%s_country_mismatch' %
                              (api_name, metric_name))

        # Always accept the first result we get.
        if best is None:
            best = found
            best_metric = metric_name
            continue

        # Or any result that appears to be an improvement over the
        # existing best guess.
        offset = distance(float(best['lat']),
                          float(best['lon']), lat, lon) * 1000
        if offset <= best['accuracy']:
            best = found
            best_metric = metric_name
        else:
            stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                              (api_name, metric_name, best_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not best and geoip_res:
        best = geoip_res
        best_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        wifi_keys = set(wifi['key'] for wifi in validated['wifi'])
        if (wifi_keys and
                len(filter_bssids_by_similarity(wifi_keys)) >=
                MIN_WIFIS_IN_QUERY):
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if best_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            cell_outcomes = {'cell': 'cell_hit', 'cell_lac': 'cell_lac_hit'}
            api_log_metric = cell_outcomes.get(best_metric, 'cell_miss')
        elif geoip_res:
            api_log_metric = 'geoip_hit'
        else:
            api_log_metric = 'geoip_miss'

        stats_client.incr('%s.api_log.%s.%s' %
                          (api_name, api_key_name, api_log_metric))

    if not best:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(best['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(best['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(best['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, best_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, best_metric),
                        rounded_result['accuracy'])

    return rounded_result