def search_cell(session, data):
    radio = RADIO_TYPE.get(data['radio'], -1)
    cells = []
    for cell in data['cell']:
        cell = normalized_cell_dict(cell, default_radio=radio)
        if not cell:
            continue

        key = to_cellkey(cell)

        query = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)
        )
        result = query.first()
        if result is not None:
            cells.append(Network(key, *result))

    if not cells:
        return

    length = len(cells)
    avg_lat = sum([c.lat for c in cells]) / length
    avg_lon = sum([c.lon for c in cells]) / length
    return {
        'lat': quantize(avg_lat),
        'lon': quantize(avg_lon),
        'accuracy': estimate_accuracy(avg_lat, avg_lon,
                                      cells, CELL_MIN_ACCURACY),
    }
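# search_cell builds on a handful of helpers defined elsewhere in the
# codebase. A minimal sketch of what the call sites above imply, under
# assumption (signatures inferred from usage, not copied from the
# original module):
from collections import namedtuple

# One entry per matched network: its key plus the stored position/range.
Network = namedtuple('Network', ['key', 'lat', 'lon', 'range'])

def estimate_accuracy(lat, lon, points, minimum):
    # Plausible reading of the call site: the reported accuracy must
    # cover the most distant contributing network, including that
    # network's own range, and is floored at `minimum`. The `distance`
    # helper is assumed to return kilometers, hence the * 1000.
    if len(points) == 1:
        accuracy = points[0].range
    else:
        accuracy = max([distance(lat, lon, p.lat, p.lon) * 1000 + p.range
                        for p in points])
    return max(accuracy, minimum)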
def search_cell_lac(session, data):
    radio = RADIO_TYPE.get(data['radio'], -1)
    lacs = []
    for cell in data['cell']:
        cell = normalized_cell_dict(cell, default_radio=radio)
        if not cell:
            continue

        cell['cid'] = CELLID_LAC
        key = to_cellkey(cell)

        query = session.query(Cell.lat, Cell.lon, Cell.range).filter(
            *join_cellkey(Cell, key)).filter(
            Cell.lat.isnot(None)).filter(
            Cell.lon.isnot(None)
        )
        result = query.first()
        if result is not None:
            lacs.append(Network(key, *result))

    if not lacs:
        return

    # pick the LAC with the smallest range, i.e. the most
    # specific of the areas the user is inside
    lac = sorted(lacs, key=operator.attrgetter('range'))[0]

    return {
        'lat': quantize(lac.lat),
        'lon': quantize(lac.lon),
        'accuracy': max(LAC_MIN_ACCURACY, lac.range),
    }
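# For orientation, a hypothetical caller of the two searches above; the
# payload shape mirrors the search API (radio name plus cell dicts),
# and `session` is assumed to be a SQLAlchemy session. Values are
# illustrative only.
payload = {
    'radio': 'gsm',
    'cell': [
        dict(radio='gsm', mcc=262, mnc=1, lac=3, cid=4),
        dict(radio='gsm', mcc=262, mnc=1, lac=3, cid=5),
    ],
}
result = search_cell(session, payload)
if result is None:
    # No exact cell matched; fall back to the coarser
    # location-area estimate.
    result = search_cell_lac(session, payload)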
def mark_moving_cells(session, moving_cells):
    moving_keys = []
    blacklist = set()
    for cell in moving_cells:
        query = session.query(CellBlacklist).filter(
            *join_cellkey(CellBlacklist, cell))
        b = query.first()
        if b is None:
            key = to_cellkey(cell)._asdict()
            blacklist.add(CellBlacklist(**key))
            moving_keys.append(key)

    get_heka_client().incr("items.blacklisted.cell_moving",
                           len(moving_keys))
    session.add_all(blacklist)
    remove_cell.delay(moving_keys)
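# mark_moving_cells leans on to_cellkey/join_cellkey. A sketch of the
# shape the call sites imply, assuming CellKey is a namedtuple over the
# five identifying columns (inferred, not copied from the original):
from collections import namedtuple

CellKey = namedtuple('CellKey', ['radio', 'mcc', 'mnc', 'lac', 'cid'])

def to_cellkey(obj):
    # Accept a dict or a model/measure row and pull out the key fields.
    if isinstance(obj, dict):
        return CellKey(**{field: obj[field] for field in CellKey._fields})
    return CellKey(*[getattr(obj, field) for field in CellKey._fields])

def join_cellkey(model, key):
    # A tuple of SQLAlchemy equality criteria, unpacked into
    # query.filter(*join_cellkey(model, key)) at the call sites.
    return (model.radio == key.radio, model.mcc == key.mcc,
            model.mnc == key.mnc, model.lac == key.lac,
            model.cid == key.cid)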
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete(synchronize_session=False)
            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            changed_lacs = set()
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()
                changed_lacs.add(key._replace(cid=CELLID_LAC))

            for key in changed_lacs:
                # Either schedule an update to the enclosing LAC or, if
                # we just removed the last cell in the LAC, remove the LAC
                # entirely.
                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid != CELLID_LAC)
                n = query.count()
                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid == CELLID_LAC)
                if n < 1:
                    query.delete()
                else:
                    lac = query.first()
                    if lac is not None:
                        lac.new_measures += 1

            session.commit()
        return cells_removed
    except Exception as exc:  # pragma: no cover
        self.heka_client.raven('error')
        raise self.retry(exc=exc)
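# The key._replace(cid=CELLID_LAC) idiom above is plain namedtuple
# machinery: it returns a new key whose cid is swapped for the
# CELLID_LAC sentinel, turning a cell key into the key of its enclosing
# location area. Values below are illustrative.
key = CellKey(radio=1, mcc=262, mnc=1, lac=3, cid=4)
lac_key = key._replace(cid=CELLID_LAC)
assert lac_key.lac == key.lac and lac_key.cid == CELLID_LAC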
def remove_cell(self, cell_keys):
    cells_removed = 0
    try:
        with self.db_session() as session:
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()

                # Either schedule an update to the enclosing LAC or, if
                # we just removed the last cell in the LAC, remove the LAC
                # entirely.
                query = session.query(func.count(Cell.id)).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid != CELLID_LAC)
                c = query.first()
                assert c is not None
                n = int(c[0])
                query = session.query(Cell).filter(
                    Cell.radio == key.radio,
                    Cell.mcc == key.mcc,
                    Cell.mnc == key.mnc,
                    Cell.lac == key.lac,
                    Cell.cid == CELLID_LAC)
                if n < 1:
                    query.delete()
                else:
                    query.update({'new_measures': '1'})
            session.commit()
        return cells_removed
    except IntegrityError as exc:  # pragma: no cover
        self.heka_client.raven('error')
        return 0
    except Exception as exc:  # pragma: no cover
        raise self.retry(exc=exc)
def remove_cell(self, cell_keys):
    try:
        cells_removed = 0
        redis_client = self.app.redis_client
        with self.db_session() as session:
            changed_lacs = set()
            for k in cell_keys:
                key = to_cellkey(k)
                query = session.query(Cell).filter(*join_cellkey(Cell, key))
                cells_removed += query.delete()
                changed_lacs.add(key._replace(cid=CELLID_LAC))

            if changed_lacs:
                session.on_post_commit(enqueue_lacs,
                                       redis_client,
                                       changed_lacs)

            session.commit()
        return cells_removed
    except Exception as exc:  # pragma: no cover
        self.heka_client.raven('error')
        raise self.retry(exc=exc)
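# on_post_commit and enqueue_lacs are specific to this codebase and not
# shown here. A rough sketch of the pattern with hypothetical names:
# callbacks registered on the session run only once commit() succeeds,
# so LAC update jobs are never enqueued for a rolled-back transaction.
import json

class PostCommitSessionWrapper(object):

    def __init__(self, session):
        self.session = session
        self._callbacks = []

    def on_post_commit(self, fn, *args):
        # Defer fn until the commit has gone through.
        self._callbacks.append((fn, args))

    def commit(self):
        self.session.commit()
        for fn, args in self._callbacks:
            fn(*args)
        self._callbacks = []

def enqueue_lacs(redis_client, lac_keys):
    # Hypothetical: push the changed LAC keys onto a Redis list that a
    # periodic task drains later.
    pipe = redis_client.pipeline()
    for key in lac_keys:
        pipe.lpush('update_lac', json.dumps(key._asdict()))
    pipe.execute()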
def test_blacklist_moving_cells(self):
    now = util.utcnow()
    long_ago = now - timedelta(days=40)
    session = self.db_master_session
    k1 = dict(radio=1, mcc=1, mnc=2, lac=3, cid=4)
    k2 = dict(radio=1, mcc=1, mnc=2, lac=6, cid=8)
    k3 = dict(radio=1, mcc=1, mnc=2, lac=9, cid=12)
    k4 = dict(radio=1, mcc=1, mnc=2, lac=12, cid=16)
    k5 = dict(radio=1, mcc=1, mnc=2, lac=15, cid=20)
    k6 = dict(radio=1, mcc=1, mnc=2, lac=18, cid=24)

    keys = set([CellKey(**k) for k in [k1, k2, k3, k4, k5, k6]])

    # keys k2, k3 and k4 are expected to be detected as moving
    data = [
        # a cell with an entry but no prior position
        Cell(new_measures=3, total_measures=0, **k1),
        CellMeasure(lat=1.001, lon=1.001, **k1),
        CellMeasure(lat=1.002, lon=1.005, **k1),
        CellMeasure(lat=1.003, lon=1.009, **k1),
        # a cell with a prior known position
        Cell(lat=2.0, lon=2.0, new_measures=2, total_measures=1, **k2),
        CellMeasure(lat=2.0, lon=2.0, **k2),
        CellMeasure(lat=4.0, lon=2.0, **k2),
        # a cell with a very different prior position
        Cell(lat=1.0, lon=1.0, new_measures=2, total_measures=1, **k3),
        CellMeasure(lat=3.0, lon=3.0, **k3),
        CellMeasure(lat=-3.0, lon=3.0, **k3),
        # another cell with a prior known position (and negative lat)
        Cell(lat=-4.0, lon=4.0, new_measures=2, total_measures=1, **k4),
        CellMeasure(lat=-4.0, lon=4.0, **k4),
        CellMeasure(lat=-6.0, lon=4.0, **k4),
        # an already blacklisted cell
        CellBlacklist(**k5),
        CellMeasure(lat=5.0, lon=5.0, **k5),
        CellMeasure(lat=8.0, lon=5.0, **k5),
        # a cell with an old different record we ignore, position
        # estimate has been updated since
        Cell(lat=6.0, lon=6.0, new_measures=2, total_measures=1, **k6),
        CellMeasure(lat=6.9, lon=6.9, time=long_ago, **k6),
        CellMeasure(lat=6.0, lon=6.0, **k6),
        CellMeasure(lat=6.001, lon=6, **k6),
    ]
    session.add_all(data)
    session.commit()

    result = location_update_cell.delay(min_new=1)
    self.assertEqual(result.get(), (5, 3))

    black = session.query(CellBlacklist).all()
    self.assertEqual(set([to_cellkey(b) for b in black]),
                     set([CellKey(**k) for k in [k2, k3, k4, k5]]))

    measures = session.query(CellMeasure).all()
    self.assertEqual(len(measures), 14)
    self.assertEqual(set([to_cellkey(m) for m in measures]), keys)

    # test duplicate call
    result = location_update_cell.delay(min_new=1)
    self.assertEqual(result.get(), 0)

    self.check_stats(
        total=6,
        timer=[
            # We made duplicate calls
            ('task.data.location_update_cell', 2),
            # One of those would've scheduled a remove_cell task
            ('task.data.remove_cell', 1)
        ],
        gauge=[
            ('task.data.location_update_cell.new_measures_1_100', 2),
        ])
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None,
                       result_type='position'):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac and
    GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    :param result_type: What kind of result to return, either a lat/lon
                        position or a country estimate.
    """
    if result_type not in ('country', 'position'):
        raise ValueError('Invalid result_type, must be one of '
                         'position or country')

    stats_client = get_stats_client()
    heka_client = get_heka_client()
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key)

    found_cells = []

    # Query all cells and OCID cells
    for model in Cell, OCIDCell, CellArea:
        cell_filter = []
        for key in validated['cell']:
            # create a list of 'and' criteria for cell keys
            criterion = join_cellkey(model, key)
            cell_filter.append(and_(*criterion))

        if cell_filter:
            # only do a query if we have cell results, or this will match
            # all rows in the table
            load_fields = ('radio', 'mcc', 'mnc', 'lac',
                           'lat', 'lon', 'range')
            query = (session.query(model).options(
                load_only(*load_fields)).filter(or_(*cell_filter)).filter(
                model.lat.isnot(None)).filter(model.lon.isnot(None)))

            try:
                found_cells.extend(query.all())
            except Exception:
                heka_client.raven(RAVEN_ERROR)

    if found_cells:
        # Group all found_cells by location area
        lacs = defaultdict(list)
        for cell in found_cells:
            cellarea_key = (cell.radio, cell.mcc, cell.mnc, cell.lac)
            lacs[cellarea_key].append(cell)

        def sort_lac(v):
            # use the lac with the most values,
            # or the one with the smallest range
            return (len(v), -min([e.range for e in v]))

        # If we get data from multiple location areas, use the one with the
        # most data points in it. That way a lac with a cell hit will
        # have two entries and win over a lac with only the lac entry.
        lac = sorted(lacs.values(), key=sort_lac, reverse=True)

        for cell in lac[0]:
            # The first entry is the key,
            # used only to distinguish cell from lac
            network = Network(
                key=None,
                lat=cell.lat,
                lon=cell.lon,
                range=cell.range)
            if type(cell) is CellArea:
                validated['cell_lac_network'].append(network)
            else:
                validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db, stats_client)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field],
                              stats_client, api_name)
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']),
                               lat, lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'

        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))

    if result_type == 'position':
        rounded_result = {
            'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
            'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
            'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
        }
        stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                            rounded_result['accuracy'])
        return rounded_result
    elif result_type == 'country':
        if countries:
            country = iso3166.countries.get(countries[0])
            return {
                'country_name': country.name,
                'country_code': country.alpha2,
            }
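# Illustrative call sites for this version (names and values are
# examples, not from the original): a position lookup and a country
# lookup share the same entry point, distinguished by result_type.
position = search_all_sources(
    session, 'geolocate', request_data,
    client_addr=request_addr, geoip_db=geoip_db,
    api_key_log=True, api_key_name='example_key',
    result_type='position')
# e.g. {'lat': 51.5, 'lon': -0.1, 'accuracy': 1000.0} or None

country = search_all_sources(
    session, 'country', request_data,
    client_addr=request_addr, geoip_db=geoip_db,
    result_type='country')
# e.g. {'country_name': 'United Kingdom', 'country_code': 'GB'} or None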
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac and
    GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pass-through wifi data
    validated['wifi'] = data.get('wifi', [])

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']),
                               lat, lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result
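# query_cell_networks is referenced above but not shown. A sketch of a
# single-round-trip implementation consistent with its call sites (one
# OR-of-ANDs filter across every cell and lac key); the returned
# Network.key carries the cid, which is how callers tell lac rows
# (cid == CELLID_LAC) apart from real cells. Inferred, not original.
def query_cell_networks(session, cell_keys):
    if not cell_keys:
        return []
    cell_filter = [and_(*join_cellkey(Cell, key)) for key in cell_keys]
    query = session.query(
        Cell.cid, Cell.lat, Cell.lon, Cell.range).filter(
        or_(*cell_filter)).filter(
        Cell.lat.isnot(None)).filter(
        Cell.lon.isnot(None))
    return [Network(key=row.cid, lat=row.lat, lon=row.lon, range=row.range)
            for row in query.all()]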
def search_all_sources(session, api_name, data,
                       client_addr=None, geoip_db=None,
                       api_key_log=False, api_key_name=None):
    """
    Common code-path for all lookup APIs, using WiFi, cell, cell-lac and
    GeoIP data sources.

    :param session: A database session for queries.
    :param api_name: A string to use in metrics (for example "geolocate").
    :param data: A dict conforming to the search API.
    :param client_addr: The IP address the request came from.
    :param geoip_db: The geoip database.
    :param api_key_log: Enable additional api key specific logging?
    :param api_key_name: The metric friendly api key name.
    """
    stats_client = get_stats_client()
    heka_client = get_heka_client()
    result = None
    result_metric = None

    validated = {
        'wifi': [],
        'cell': [],
        'cell_lac': set(),
        'cell_network': [],
        'cell_lac_network': [],
    }

    # Pre-process wifi data
    for wifi in data.get('wifi', ()):
        wifi = normalized_wifi_dict(wifi)
        if wifi:
            validated['wifi'].append(wifi)

    # Pre-process cell data
    radio = RADIO_TYPE.get(data.get('radio', ''), -1)
    for cell in data.get('cell', ()):
        cell = normalized_cell_dict(cell, default_radio=radio)
        if cell:
            cell_key = to_cellkey(cell)
            validated['cell'].append(cell_key)
            validated['cell_lac'].add(cell_key._replace(cid=CELLID_LAC))

    # Merge all possible cell and lac keys into one list
    all_cell_keys = []
    all_cell_keys.extend(validated['cell'])
    for key in validated['cell_lac']:
        all_cell_keys.append(key)

    # Do a single query for all cells and lacs at the same time
    try:
        all_networks = query_cell_networks(session, all_cell_keys)
    except Exception:
        heka_client.raven(RAVEN_ERROR)
        all_networks = []
    for network in all_networks:
        if network.key == CELLID_LAC:
            validated['cell_lac_network'].append(network)
        else:
            validated['cell_network'].append(network)

    # Always do a GeoIP lookup because it is cheap and we want to
    # report geoip vs. other data mismatches. We may also use
    # the full GeoIP City-level estimate as well, if all else fails.
    (geoip_res, countries) = geoip_and_best_guess_country_codes(
        validated['cell'], api_name, client_addr, geoip_db)

    # First we attempt a "zoom-in" from cell-lac, to cell
    # to wifi, tightening our estimate each step only so
    # long as it doesn't contradict the existing best-estimate
    # nor the possible countries of origin.
    for (data_field, object_field, metric_name, search_fn) in [
            ('cell_lac', 'cell_lac_network', 'cell_lac', search_cell_lac),
            ('cell', 'cell_network', 'cell', search_cell),
            ('wifi', 'wifi', 'wifi', search_wifi)]:
        if validated[data_field]:
            r = None
            try:
                r = search_fn(session, validated[object_field])
            except Exception:
                heka_client.raven(RAVEN_ERROR)
                stats_client.incr('%s.%s_error' %
                                  (api_name, metric_name))

            if r is None:
                stats_client.incr('%s.no_%s_found' %
                                  (api_name, metric_name))
            else:
                lat = float(r['lat'])
                lon = float(r['lon'])

                stats_client.incr('%s.%s_found' %
                                  (api_name, metric_name))

                # Skip any hit that matches none of the possible countries.
                country_match = False
                for country in countries:
                    if location_is_in_country(lat, lon, country, 1):
                        country_match = True
                        break

                if countries and not country_match:
                    stats_client.incr('%s.anomaly.%s_country_mismatch' %
                                      (api_name, metric_name))

                # Always accept the first result we get.
                if result is None:
                    result = r
                    result_metric = metric_name

                # Or any result that appears to be an improvement over the
                # existing best guess.
                elif (distance(float(result['lat']), float(result['lon']),
                               lat, lon) * 1000 <= result['accuracy']):
                    result = r
                    result_metric = metric_name

                else:
                    stats_client.incr('%s.anomaly.%s_%s_mismatch' %
                                      (api_name, metric_name, result_metric))

    # Fall back to GeoIP if nothing has worked yet. We do not
    # include this in the "zoom-in" loop because GeoIP is
    # frequently _wrong_ at the city level; we only want to
    # accept that estimate if we got nothing better from cell
    # or wifi.
    if not result and geoip_res:
        result = geoip_res
        result_metric = 'geoip'

    # Do detailed logging for some api keys
    if api_key_log and api_key_name:
        api_log_metric = None
        wifi_keys = set([w['key'] for w in validated['wifi']])
        if wifi_keys and \
           len(filter_bssids_by_similarity(wifi_keys)) >= MIN_WIFIS_IN_QUERY:
            # Only count requests as WiFi-based if they contain enough
            # distinct WiFi networks to pass our filters
            if result_metric == 'wifi':
                api_log_metric = 'wifi_hit'
            else:
                api_log_metric = 'wifi_miss'
        elif validated['cell']:
            if result_metric == 'cell':
                api_log_metric = 'cell_hit'
            elif result_metric == 'cell_lac':
                api_log_metric = 'cell_lac_hit'
            else:
                api_log_metric = 'cell_miss'
        else:
            if geoip_res:
                api_log_metric = 'geoip_hit'
            else:
                api_log_metric = 'geoip_miss'

        if api_log_metric:
            stats_client.incr('%s.api_log.%s.%s' % (
                api_name, api_key_name, api_log_metric))

    if not result:
        stats_client.incr('%s.miss' % api_name)
        return None

    rounded_result = {
        'lat': round(result['lat'], DEGREE_DECIMAL_PLACES),
        'lon': round(result['lon'], DEGREE_DECIMAL_PLACES),
        'accuracy': round(result['accuracy'], DEGREE_DECIMAL_PLACES),
    }

    stats_client.incr('%s.%s_hit' % (api_name, result_metric))
    stats_client.timing('%s.accuracy.%s' % (api_name, result_metric),
                        rounded_result['accuracy'])
    return rounded_result
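# filter_bssids_by_similarity is assumed to collapse BSSIDs that are
# numerically adjacent, since several virtual networks on one physical
# access point often differ only in the low bits of the MAC. A rough
# sketch of that idea, assuming keys are plain hex strings; threshold
# and details are guesses, not the original implementation.
def filter_bssids_by_similarity(bssids, max_delta=2):
    kept = []
    last_value = None
    for bssid in sorted(bssids):
        value = int(bssid, 16)
        if last_value is not None and value - last_value <= max_delta:
            continue  # too close to a BSSID we already kept
        kept.append(bssid)
        last_value = value
    return kept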