def mark_moving_cells(session, moving_cells):
    """Blacklist cells detected as moving and schedule their removal.

    Every cell is looked up in ``CellBlacklist`` first. Only cells
    without an existing entry get a new blacklist row, are counted in
    the ``items.blacklisted.cell_moving`` metric and are passed on to
    the ``remove_cell`` task.

    :param session: A database session used for lookups and inserts.
    :param moving_cells: An iterable of cell objects accepted by
        ``join_cellkey`` and ``to_cellkey``.
    :returns: None.
    """
    moving_keys = []
    blacklist = set()
    for cell in moving_cells:
        query = session.query(CellBlacklist).filter(
            *join_cellkey(CellBlacklist, cell))
        if query.first() is not None:
            # already blacklisted, nothing to do for this cell
            continue
        key = to_cellkey(cell)._asdict()
        blacklist.add(CellBlacklist(**key))
        moving_keys.append(key)

    if not moving_keys:
        # Nothing new to blacklist: skip the zero-count metric and do not
        # queue a remove_cell task with an empty key list. This mirrors
        # the early exit in mark_moving_wifis.
        return

    get_heka_client().incr("items.blacklisted.cell_moving",
                           len(moving_keys))
    session.add_all(blacklist)
    remove_cell.delay(moving_keys)
def submit_view(request):
    """Validate a submission and queue its items for insertion.

    Records whether an API key was supplied, runs the request through
    ``preprocess_request`` with ``SubmitSchema`` and hands the submitted
    items to ``insert_measures`` tasks in slices of 100 items.

    :param request: The incoming request object.
    :returns: An ``HTTPNoContent`` response.
    """
    api_key = request.GET.get('key', None)
    stats = get_heka_client()

    if api_key is not None:
        stats.incr('submit.api_key.%s' % api_key.replace('.', '__'))
    else:
        # we don't require API keys for submit yet
        stats.incr('submit.no_api_key')

    data, errors = preprocess_request(
        request,
        schema=SubmitSchema(),
        extra_checks=(submit_validator, ),
    )
    items = data['items']

    nickname = request.headers.get('X-Nickname', u'')
    if isinstance(nickname, str):
        nickname = nickname.decode('utf-8', 'ignore')

    # batch incoming data into multiple tasks, in case someone
    # manages to submit us a huge single request
    batch_size = 100
    total = len(items)
    start = 0
    while start < total:
        batch = items[start:start + batch_size]
        insert_measures.delay(
            # TODO convert items to json with support for decimal/datetime
            items=dumps(batch),
            nickname=nickname,
        )
        start += batch_size

    return HTTPNoContent()
def mark_moving_wifis(session, moving_wifis):
    """Blacklist wifis detected as moving and schedule their removal.

    Keys that already have a ``WifiBlacklist`` row are skipped. The
    remaining keys are inserted, counted in the
    ``items.blacklisted.wifi_moving`` metric and handed to the
    ``remove_wifi`` task.

    :param session: A database session used for the lookup and inserts.
    :param moving_wifis: An iterable of objects with a ``key`` attribute.
    :returns: None.
    """
    moving_keys = set(wifi.key for wifi in moving_wifis)
    if not moving_keys:
        # Nothing to do. Returning here also avoids sending an IN ()
        # query with an empty key set to the database.
        return

    query = session.query(WifiBlacklist.key).filter(
        WifiBlacklist.key.in_(moving_keys))
    already_blocked = set(row[0] for row in query.all())
    moving_keys -= already_blocked
    if not moving_keys:
        return

    utcnow = datetime.utcnow()
    for key in moving_keys:
        # on duplicate key, do a no-op change
        stmt = WifiBlacklist.__table__.insert(
            on_duplicate='created=created').values(
            key=key, created=utcnow)
        session.execute(stmt)

    get_heka_client().incr("items.blacklisted.wifi_moving",
                           len(moving_keys))
    remove_wifi.delay(list(moving_keys))
def process_measures(items, session, userid=None):
    """Create measure rows for submitted items and fan out insert tasks.

    One ``Measure`` row is added per item and the session is flushed so
    the rows have ids. Each item is passed through ``process_time`` and
    ``process_measure``; the resulting cell and wifi measures are grouped
    per station key and sent to the matching insert task. Afterwards the
    user score (if a user is known) and the map statistics are updated.

    :param items: A sequence of submitted item dicts with ``lat``/``lon``.
    :param session: A database session.
    :param userid: Optional id of the submitting user.
    """
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    # lower time bound: 60 days before now
    utcmin = utcnow - datetime.timedelta(60)

    # get enough auto-increment ids assigned
    measures = []
    for _ in items:
        row = Measure()
        measures.append(row)
        session.add(row)
    # TODO switch unique measure id to a uuid, so we don't have to do
    # get these from a savepoint here
    session.flush()

    positions = []
    cell_measures = []
    wifi_measures = []
    for row, raw_item in zip(measures, items):
        item = process_time(raw_item, utcnow, utcmin)
        cell, wifi = process_measure(row.id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        position = {
            'lat': to_precise_int(item['lat']),
            'lon': to_precise_int(item['lon']),
        }
        positions.append(position)

    stats = get_heka_client()

    if cell_measures:
        # group by and create task per cell key
        stats.incr("items.uploaded.cell_measures", len(cell_measures))
        by_cell = defaultdict(list)
        for entry in cell_measures:
            key = CellKey(entry['radio'], entry['mcc'], entry['mnc'],
                          entry['lac'], entry['cid'], entry['psc'])
            by_cell[key].append(entry)
        for group in by_cell.values():
            insert_cell_measures.delay(group, userid=userid)

    if wifi_measures:
        # group by and create task per wifi key
        stats.incr("items.uploaded.wifi_measures", len(wifi_measures))
        by_wifi = defaultdict(list)
        for entry in wifi_measures:
            by_wifi[entry['key']].append(entry)
        for group in by_wifi.values():
            insert_wifi_measures.delay(group, userid=userid)

    if userid is not None:
        process_score(userid, len(items), session)
    if positions:
        process_mapstat(positions, session, userid=userid)
def mark_moving_wifis(session, moving_wifis):
    """Add moving wifis to the blacklist and queue their removal.

    Looks up which of the given wifi keys are already present in
    ``WifiBlacklist``, inserts a row for each remaining key, increments
    the ``items.blacklisted.wifi_moving`` metric by the number of newly
    blacklisted keys and passes those keys to the ``remove_wifi`` task.

    :param session: A database session used for the lookup and inserts.
    :param moving_wifis: An iterable of objects with a ``key`` attribute.
    :returns: None.
    """
    candidate_keys = set(wifi.key for wifi in moving_wifis)
    if not candidate_keys:
        # Empty input: return before touching the database, so no
        # IN-predicate with an empty value list is ever issued.
        return

    blocked_rows = session.query(WifiBlacklist.key).filter(
        WifiBlacklist.key.in_(candidate_keys)).all()
    new_keys = candidate_keys - set(row[0] for row in blocked_rows)
    if not new_keys:
        return

    utcnow = datetime.utcnow()
    for key in new_keys:
        # on duplicate key, do a no-op change
        stmt = WifiBlacklist.__table__.insert(
            on_duplicate='created=created').values(
            key=key, created=utcnow)
        session.execute(stmt)

    get_heka_client().incr("items.blacklisted.wifi_moving", len(new_keys))
    remove_wifi.delay(list(new_keys))
def geolocate_view(request):
    """Answer a geolocate request from wifi, cell, cell-LAC or GeoIP data.

    The request is validated with ``GeoLocateSchema`` (empty bodies are
    accepted). Wifi data is used when present; otherwise cell data is
    tried, followed by the cell LAC lookup. If nothing matched and a
    client address is known, GeoIP is consulted. Every hit increments a
    ``geolocate.*_hit`` counter and reports the accuracy via a timer.

    :param request: The incoming request object.
    :returns: A dict with ``location`` and ``accuracy`` on success, or an
        ``HTTPNotFound`` response with a JSON body on a miss.
    """
    stats = get_heka_client()

    data, errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONError,
        accept_empty=True,
    )

    session = request.db_slave_session
    location = None

    if data and data['wifiAccessPoints']:
        location = search_wifi_ap(session, data)
        if location is not None:
            stats.incr('geolocate.wifi_hit')
            stats.timer_send('geolocate.accuracy.wifi',
                             location['accuracy'])
    elif data:
        location = search_cell_tower(session, data)
        if location is not None:
            stats.incr('geolocate.cell_hit')
            stats.timer_send('geolocate.accuracy.cell',
                             location['accuracy'])

        # the LAC fallback is only attempted on the cell branch
        if location is None:
            location = search_cell_tower_lac(session, data)
            if location is not None:
                stats.incr('geolocate.cell_lac_hit')
                stats.timer_send('geolocate.accuracy.cell_lac',
                                 location['accuracy'])

    if location is None and request.client_addr:
        location = search_geoip(request.registry.geoip_db,
                                request.client_addr)
        if location is not None:
            stats.incr('geolocate.geoip_hit')
            stats.timer_send('geolocate.accuracy.geoip',
                             location['accuracy'])

    if location is None:
        stats.incr('geolocate.miss')
        response = HTTPNotFound()
        response.content_type = 'application/json'
        response.body = NOT_FOUND
        return response

    position = {
        "lat": location['lat'],
        "lng": location['lon'],
    }
    return {
        "location": position,
        "accuracy": float(location['accuracy']),
    }
def process_measures(items, session, userid=None):
    """Store measure rows for the items and schedule per-station inserts.

    A ``Measure`` row is created for every item and flushed to obtain
    ids. Items go through ``process_time`` and ``process_measure``; the
    returned cell measures are grouped by ``to_cellkey_psc`` and the wifi
    measures by their ``key`` before being sent to the insert tasks.
    The user score and map statistics are updated at the end.

    :param items: A sequence of submitted item dicts with ``lat``/``lon``.
    :param session: A database session.
    :param userid: Optional id of the submitting user.
    """
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    # lower time bound: 60 days before now
    utcmin = utcnow - datetime.timedelta(60)

    # get enough auto-increment ids assigned
    measures = []
    for _ in items:
        new_measure = Measure()
        measures.append(new_measure)
        session.add(new_measure)
    # TODO switch unique measure id to a uuid, so we don't have to do
    # get these from a savepoint here
    session.flush()

    positions = []
    cell_measures = []
    wifi_measures = []
    for new_measure, submitted in zip(measures, items):
        item = process_time(submitted, utcnow, utcmin)
        cell, wifi = process_measure(new_measure.id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        lat = to_precise_int(item['lat'])
        lon = to_precise_int(item['lon'])
        positions.append({'lat': lat, 'lon': lon})

    stats = get_heka_client()

    if cell_measures:
        # group by and create task per cell key
        stats.incr("items.uploaded.cell_measures", len(cell_measures))
        grouped_cells = defaultdict(list)
        for cell_measure in cell_measures:
            grouped_cells[to_cellkey_psc(cell_measure)].append(cell_measure)
        for batch in grouped_cells.values():
            insert_cell_measures.delay(batch, userid=userid)

    if wifi_measures:
        # group by and create task per wifi key
        stats.incr("items.uploaded.wifi_measures", len(wifi_measures))
        grouped_wifis = defaultdict(list)
        for wifi_measure in wifi_measures:
            grouped_wifis[wifi_measure['key']].append(wifi_measure)
        for batch in grouped_wifis.values():
            insert_wifi_measures.delay(batch, userid=userid)

    if userid is not None:
        process_score(userid, len(items), session)
    if positions:
        process_mapstat(positions, session, userid=userid)
def search_view(request):
    """Answer a search request from wifi, cell, cell-LAC or GeoIP data.

    The request is validated with ``SearchSchema`` (empty bodies are
    accepted). Lookups are tried in order wifi, cell, cell LAC and, when
    a client address is available, GeoIP; the first hit wins. Each hit
    increments a ``search.*_hit`` counter and reports its accuracy.

    :param request: The incoming request object.
    :returns: A dict with ``status`` ``'ok'`` plus ``lat``, ``lon`` and
        ``accuracy``, or ``{'status': 'not_found'}`` on a miss.
    """
    stats = get_heka_client()

    data, errors = preprocess_request(
        request,
        schema=SearchSchema(),
        extra_checks=(check_cell_or_wifi, ),
        accept_empty=True,
    )

    session = request.db_slave_session
    found = None

    # the database lookups only make sense with request data
    if data:
        if data['wifi']:
            found = search_wifi(session, data)
            if found is not None:
                stats.incr('search.wifi_hit')
                stats.timer_send('search.accuracy.wifi',
                                 found['accuracy'])

        if found is None:
            # no wifi result found, fall back to cell
            found = search_cell(session, data)
            if found is not None:
                stats.incr('search.cell_hit')
                stats.timer_send('search.accuracy.cell',
                                 found['accuracy'])

        if found is None:
            # no direct cell result found, try cell LAC
            found = search_cell_lac(session, data)
            if found is not None:
                stats.incr('search.cell_lac_hit')
                stats.timer_send('search.accuracy.cell_lac',
                                 found['accuracy'])

    if found is None and request.client_addr:
        # no cell or wifi, fall back again to geoip
        found = search_geoip(request.registry.geoip_db,
                             request.client_addr)
        if found is not None:
            stats.incr('search.geoip_hit')
            stats.timer_send('search.accuracy.geoip',
                             found['accuracy'])

    if found is None:
        stats.incr('search.miss')
        return {'status': 'not_found'}

    response = {'status': 'ok'}
    response['lat'] = found['lat']
    response['lon'] = found['lon']
    response['accuracy'] = found['accuracy']
    return response
def geolocate_view(request):
    """Answer a geolocate request; an API key is mandatory.

    Requests without a ``key`` query parameter are rejected with an
    ``HTTPBadRequest`` JSON response. Otherwise the body is validated
    with ``GeoLocateSchema`` and a position is looked up from wifi data
    when present, else from cell data, with GeoIP as the last resort.

    :param request: The incoming request object.
    :returns: A dict with ``location`` and ``accuracy`` on success, an
        ``HTTPBadRequest`` response without API key, or an
        ``HTTPNotFound`` response on a miss.
    """
    api_key = request.GET.get('key', None)
    stats = get_heka_client()

    if api_key is None:
        stats.incr('geolocate.no_api_key')
        rejected = HTTPBadRequest()
        rejected.content_type = 'application/json'
        rejected.body = NO_API_KEY
        return rejected

    stats.incr('geolocate.api_key.%s' % api_key.replace('.', '__'))

    data, errors = preprocess_request(
        request,
        schema=GeoLocateSchema(),
        extra_checks=(geolocate_validator, ),
        response=JSONError,
    )

    session = request.db_slave_session
    location = None

    if not data['wifiAccessPoints']:
        location = search_cell_tower(session, data)
        if location is not None:
            stats.incr('geolocate.cell_hit')
    else:
        location = search_wifi_ap(session, data)
        if location is not None:
            stats.incr('geolocate.wifi_hit')

    if location is None and request.client_addr:
        location = search_geoip(request.registry.geoip_db,
                                request.client_addr)
        if location is not None:
            stats.incr('geolocate.geoip_hit')

    if location is None:
        stats.incr('geolocate.miss')
        missing = HTTPNotFound()
        missing.content_type = 'application/json'
        missing.body = NOT_FOUND
        return missing

    position = {
        "lat": location['lat'],
        "lng": location['lon'],
    }
    return {
        "location": position,
        "accuracy": float(location['accuracy']),
    }
def search_view(request):
    """Answer a search request; an API key is mandatory.

    Requests without a ``key`` query parameter are counted and answered
    with ``{'status': 'not_found'}``. Otherwise the body is validated
    with ``SearchSchema`` and a position is looked up from wifi data,
    then cell data, then GeoIP when a client address is available.

    :param request: The incoming request object.
    :returns: A dict with ``status`` ``'ok'`` plus ``lat``, ``lon`` and
        ``accuracy``, or ``{'status': 'not_found'}``.
    """
    api_key = request.GET.get('key', None)
    stats = get_heka_client()

    if api_key is None:
        # TODO: change into a better error response
        stats.incr('search.no_api_key')
        return {'status': 'not_found'}

    stats.incr('search.api_key')

    data, errors = preprocess_request(
        request,
        schema=SearchSchema(),
        extra_checks=(check_cell_or_wifi, ),
    )

    session = request.db_slave_session
    found = None

    if data['wifi']:
        found = search_wifi(session, data)
        if found is not None:
            stats.incr('search.wifi_hit')

    if found is None:
        # no wifi result found, fall back to cell
        found = search_cell(session, data)
        if found is not None:
            stats.incr('search.cell_hit')

    if found is None and request.client_addr:
        # no cell or wifi, fall back again to geoip
        found = search_geoip(request.registry.geoip_db,
                             request.client_addr)
        if found is not None:
            stats.incr('search.geoip_hit')

    if found is None:
        stats.incr('search.miss')
        return {'status': 'not_found'}

    response = {'status': 'ok'}
    response['lat'] = found['lat']
    response['lon'] = found['lon']
    response['accuracy'] = found['accuracy']
    return response
def search_view(request):
    """Answer a search request from wifi, cell, cell-LAC or GeoIP data.

    The body is validated with ``SearchSchema``. Lookups are tried in
    order wifi (when wifi data is present), cell, cell LAC and, when a
    client address is available, GeoIP. The first hit wins and
    increments the matching ``search.*_hit`` counter.

    :param request: The incoming request object.
    :returns: A dict with ``status`` ``'ok'`` plus ``lat``, ``lon`` and
        ``accuracy``, or ``{'status': 'not_found'}`` on a miss.
    """
    stats = get_heka_client()

    data, errors = preprocess_request(
        request,
        schema=SearchSchema(),
        extra_checks=(check_cell_or_wifi, ),
    )

    session = request.db_slave_session
    match = None

    if data['wifi']:
        match = search_wifi(session, data)
        if match is not None:
            stats.incr('search.wifi_hit')

    if match is None:
        # no wifi result found, fall back to cell
        match = search_cell(session, data)
        if match is not None:
            stats.incr('search.cell_hit')

    if match is None:
        # no direct cell result found, try cell LAC
        match = search_cell_lac(session, data)
        if match is not None:
            stats.incr('search.cell_lac_hit')

    if match is None and request.client_addr:
        # no cell or wifi, fall back again to geoip
        match = search_geoip(request.registry.geoip_db,
                             request.client_addr)
        if match is not None:
            stats.incr('search.geoip_hit')

    if match is None:
        stats.incr('search.miss')
        return {'status': 'not_found'}

    answer = {'status': 'ok'}
    answer['lat'] = match['lat']
    answer['lon'] = match['lon']
    answer['accuracy'] = match['accuracy']
    return answer
def closure(request, *args, **kwargs):
    """Record API key metrics for a request, then call the wrapped view.

    Uses ``func``, ``func_name`` and ``error_on_invalidkey`` from the
    enclosing scope. A missing key is counted and, when
    ``error_on_invalidkey`` is set, answered with an ``HTTPBadRequest``
    JSON response. A supplied key is checked against ``ApiKey`` and
    counted as either a known or an unknown key; the wrapped view is
    called in both cases.
    """
    api_key = request.GET.get('key', None)
    stats = get_heka_client()

    if api_key is not None:
        session = request.db_slave_session
        key_query = session.query(ApiKey)
        key_query = key_query.filter(ApiKey.valid_key == api_key)
        if key_query.count():
            metric = '%s.api_key.%s' % (func_name,
                                        api_key.replace('.', '__'))
        else:
            metric = '%s.unknown_api_key' % func_name
        stats.incr(metric)
    else:
        stats.incr('%s.no_api_key' % func_name)
        if error_on_invalidkey:
            rejected = HTTPBadRequest()
            rejected.content_type = 'application/json'
            rejected.body = NO_API_KEY
            return rejected

    return func(request, *args, **kwargs)
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    """Create cell measures from entries, enforcing a per-cell maximum.

    Entries for which ``create_cell_measure`` returns nothing are counted
    as malformed. For every distinct cell key the remaining capacity is
    looked up once from ``Cell.total_measures`` and decremented per
    accepted measure; measures beyond the capacity are counted as
    overflow and dropped. Accepted measures are added to the session,
    the per-cell counts are updated and new cells are credited to the
    user.

    :param session: A database session.
    :param entries: An iterable of raw cell measure entries.
    :param userid: Optional id of the submitting user.
    :param max_measures_per_cell: Upper bound of measures per cell.
    :returns: The list of accepted cell measure objects.
    """
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)

    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}

    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue

        cell_key = to_cellkey_psc(cell_measure)

        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                *join_cellkey(Cell, cell_key))
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell

        if space_available[cell_key] > 0:
            space_available[cell_key] -= 1
        else:
            dropped_overflow += 1
            continue

        # The cached capacity check above already guarantees that the
        # stored total is below the maximum, so no second per-entry
        # query of Cell.total_measures is needed here.
        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1

    heka_client = get_heka_client()

    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)

    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)

    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)

    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')

    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
def heka_client(self):
    """Return the client provided by ``get_heka_client``."""
    client = get_heka_client()
    return client
def process_wifi_measures(session, entries, userid=None,
                          max_measures_per_wifi=11000):
    """Create wifi measures from entries, enforcing a per-AP maximum.

    For every distinct wifi key the remaining capacity is looked up once
    from ``Wifi.total_measures`` and decremented per accepted measure;
    measures beyond the capacity are counted as overflow and dropped.
    Accepted measures are added to the session. Measures of blacklisted
    wifis are still stored, but they are excluded from the per-wifi
    counters and from the user score.

    :param session: A database session.
    :param entries: A sequence of wifi entry dicts with a ``key``; each
        accepted entry is modified in place by ``convert_frequency``.
    :param userid: Optional id of the submitting user.
    :param max_measures_per_wifi: Upper bound of measures per wifi.
    :returns: The list of accepted wifi measure objects.
    """
    wifi_measures = []
    wifi_count = defaultdict(int)
    wifi_keys = set(e['key'] for e in entries)

    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)

    # did we get measures for blacklisted wifis?
    blacked = session.query(WifiBlacklist.key).filter(
        WifiBlacklist.key.in_(wifi_keys)).all()
    blacked = set(b[0] for b in blacked)

    space_available = {}
    dropped_overflow = 0

    # process entries
    for entry in entries:
        wifi_key = entry['key']

        # check if there's space for new measurement within per-AP maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if wifi_key not in space_available:
            query = session.query(Wifi.total_measures).filter(
                Wifi.key == wifi_key)
            curr = query.first()
            if curr is not None:
                space_available[wifi_key] = max_measures_per_wifi - curr[0]
            else:
                space_available[wifi_key] = max_measures_per_wifi

        if space_available[wifi_key] > 0:
            space_available[wifi_key] -= 1
        else:
            dropped_overflow += 1
            continue

        # convert frequency into channel numbers and remove frequency
        convert_frequency(entry)
        wifi_measures.append(create_wifi_measure(utcnow, entry))
        if wifi_key not in blacked:
            # skip blacklisted wifi AP's
            wifi_count[wifi_key] += 1

    heka_client = get_heka_client()

    if dropped_overflow != 0:
        heka_client.incr("items.dropped.wifi_ingress_overflow",
                         count=dropped_overflow)

    # update user score
    # This has to run before the counters below are written, as that
    # step inserts rows for wifis that were unknown so far.
    if userid is not None:
        # do we already know about any wifis?
        # Only keys that actually got a measure accepted are looked up.
        # Comparing against every non-blacklisted key of the batch would
        # also subtract known wifis whose measures were all dropped for
        # overflow and thereby under-count the new ones.
        counted_keys = set(wifi_count)
        if counted_keys:
            wifis = session.query(Wifi.key).filter(
                Wifi.key.in_(counted_keys))
            known_keys = set(w[0] for w in wifis.all())
        else:
            known_keys = set()
        # subtract known wifis from all unique wifis
        new_wifis = len(counted_keys) - len(known_keys)
        if new_wifis > 0:
            process_score(userid, new_wifis, session, key='new_wifi')

    # update new/total measure counts
    for wifi_key, num in wifi_count.items():
        stmt = Wifi.__table__.insert(
            on_duplicate='new_measures = new_measures + %s, '
                         'total_measures = total_measures + %s' % (num, num)
        ).values(
            key=wifi_key, created=utcnow,
            new_measures=num, total_measures=num)
        session.execute(stmt)

    heka_client.incr("items.inserted.wifi_measures",
                     count=len(wifi_measures))
    session.add_all(wifi_measures)
    return wifi_measures
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    """Create cell measures from entries, enforcing a per-cell maximum.

    Entries for which ``create_cell_measure`` returns nothing are counted
    as malformed. The remaining capacity of each distinct cell key is
    read once from ``Cell.total_measures`` and reduced by one per
    accepted measure; once it is used up, further measures for that cell
    are counted as overflow and dropped. Accepted measures are added to
    the session, the per-cell counts are updated and new cells are
    credited to the user.

    :param session: A database session.
    :param entries: An iterable of raw cell measure entries.
    :param userid: Optional id of the submitting user.
    :param max_measures_per_cell: Upper bound of measures per cell.
    :returns: The list of accepted cell measure objects.
    """
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)

    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}

    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue

        cell_key = to_cellkey_psc(cell_measure)

        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                Cell.radio == cell_key.radio,
                Cell.mcc == cell_key.mcc,
                Cell.mnc == cell_key.mnc,
                Cell.lac == cell_key.lac,
                Cell.cid == cell_key.cid,
                Cell.psc == cell_key.psc)
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell

        if space_available[cell_key] <= 0:
            dropped_overflow += 1
            continue
        space_available[cell_key] -= 1

        # No further lookup of Cell.total_measures per entry: a measure
        # only gets here while the stored total is below the maximum, so
        # a repeated "total > maximum" test could never drop anything.
        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1

    heka_client = get_heka_client()

    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)

    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)

    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)

    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')

    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using
    WiFi, cell, cell-lac and GeoIP data sources.

    Candidate results are tried in the order cell-lac, cell, wifi. A
    candidate replaces the current best result only if it matches one of
    the guessed countries (when there are any) and lies within the
    accuracy of the current best result. GeoIP is used only if none of
    these produced a result.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :returns: A dict with rounded ``lat``, ``lon`` and ``accuracy``
        values, or None if no source produced a result.
    """

    stats_client = get_stats_client()
    heka_client = get_heka_client()

    # Best result so far and the metric name of the source it came from.
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            # The matching area key is the cell key with its cid replaced
            # by CELLID_LAC; the set removes duplicates.
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        # Best effort: report the error and continue without cell data.
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        # NOTE(review): network.key is compared with CELLID_LAC directly,
        # so it presumably holds the cid here - confirm against
        # query_cell_networks.
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because we at _least_ want to use the
    # country estimate to filter out bogus requests. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    #
    # data_field decides whether a source is tried at all, object_field
    # names the validated data that is passed to its search function.

    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:

        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                # A failing search is reported and then treated like a
                # search without a result (r stays None).
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))

            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Otherwise at least accept the first result we get.
                elif result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                # NOTE(review): the factor 1000 presumably converts
                # kilometers to the unit of 'accuracy' - confirm against
                # distance().
                elif (distance(float(result['lat']),
                               float(result['lon']), lat, lon) * 1000
                      <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    # The candidate lies outside the accuracy of the
                    # current best result: keep the old result.
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    # All three values are rounded to DEGREE_DECIMAL_PLACES digits.
    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])

    return rounded_result