def process_mapstat(measures, session, userid=None):
    tiles = defaultdict(int)
    # aggregate to 100x100m tiles
    for measure in measures:
        tiles[(measure.lat / 10000, measure.lon / 10000)] += 1
    lats = set([k[0] for k in tiles.keys()])
    lons = set([k[1] for k in tiles.keys()])
    result = session.query(MapStat).filter(
        MapStat.key == MAPSTAT_TYPE['location']).filter(
        MapStat.lat.in_(lats)).filter(
        MapStat.lon.in_(lons)).all()
    prior = {}
    for r in result:
        prior[(r.lat, r.lon)] = r
    tile_count = 0
    for (lat, lon), value in tiles.items():
        old = prior.get((lat, lon), None)
        if old:
            old.value = MapStat.value + value
        else:
            tile_count += 1
            stmt = MapStat.__table__.insert(
                on_duplicate='value = value + %s' % int(value)).values(
                lat=lat, lon=lon, key=MAPSTAT_TYPE['location'], value=value)
            session.execute(stmt)
    if userid is not None and tile_count > 0:
        process_score(userid, tile_count, session, key='new_location')
def process_cell_measure(session, measure_data, entries, userid=None):
    cell_count = defaultdict(int)
    cell_measures = []
    created = decode_datetime(measure_data.get('created', ''))
    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(measure_data, entry)
        # use more specific cell type or
        # fall back to less precise measure
        if entry.get('radio'):
            cell_measure.radio = RADIO_TYPE.get(entry['radio'], -1)
        else:
            cell_measure.radio = measure_data['radio']
        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[CellKey(cell_measure.radio, cell_measure.mcc,
                           cell_measure.mnc, cell_measure.lac,
                           cell_measure.cid)] += 1
    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, created, session)
    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')
    session.add_all(cell_measures)
    return cell_measures
def process_mapstat(measures, session, userid=None):
    tiles = defaultdict(int)
    # aggregate to 100x100m tiles
    for measure in measures:
        tiles[(measure.lat / 10000, measure.lon / 10000)] += 1
    # TODO: on duplicate key update
    lats = set([k[0] for k in tiles.keys()])
    lons = set([k[1] for k in tiles.keys()])
    result = session.query(MapStat).filter(
        MapStat.key == MAPSTAT_TYPE['location']).filter(
        MapStat.lat.in_(lats)).filter(
        MapStat.lon.in_(lons)).all()
    prior = {}
    for r in result:
        prior[(r.lat, r.lon)] = r
    tile_count = 0
    for (lat, lon), value in tiles.items():
        stat = MapStat(lat=lat, lon=lon, value=value)
        old = prior.get((lat, lon), None)
        if old:
            if old.value < 2:
                # give points for the first two tile hits
                tile_count += 1
            old.value = MapStat.value + value
        else:
            tile_count += 1
            session.add(stat)
    if userid is not None and tile_count > 0:
        process_score(userid, tile_count, session, key='new_location')
def update_cell_measure_count(measure, session, userid=None):
    if (measure.radio == -1 or measure.lac == 0 or measure.cid == 0):
        # only update data for complete records
        return
    # do we already know about these cells?
    query = session.query(Cell).filter(
        Cell.radio == measure.radio).filter(
        Cell.mcc == measure.mcc).filter(
        Cell.mnc == measure.mnc).filter(
        Cell.lac == measure.lac).filter(
        Cell.cid == measure.cid
    )
    cell = query.first()
    new_cell = 0
    if cell is None:
        new_cell += 1
    stmt = Cell.__table__.insert(
        on_duplicate='new_measures = new_measures + 1, '
                     'total_measures = total_measures + 1').values(
        created=measure.created, radio=measure.radio,
        mcc=measure.mcc, mnc=measure.mnc, lac=measure.lac, cid=measure.cid,
        new_measures=1, total_measures=1)
    session.execute(stmt)
    if userid is not None and new_cell > 0:
        # update user score
        process_score(userid, new_cell, session, key='new_cell')
def submit_post(request):
    session = request.db_master_session
    session_objects = []
    nickname = request.headers.get('X-Nickname', '')
    userid, nickname = process_user(nickname, session)
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(60)
    points = 0
    measures = []
    for item in request.validated['items']:
        item = process_time(item, utcnow, utcmin)
        measure = process_measure(item, utcnow, session, userid=userid)
        measures.append(measure)
        points += 1
    if userid is not None:
        process_score(userid, points, session)
    if measures:
        process_mapstat(measures, session, userid=userid)
    session.add_all(session_objects)
    session.commit()
    return HTTPNoContent()
def update_wifi_measure_count(wifi_key, wifis, session, userid=None):
    # side-effect, modifies wifis
    new_wifi = 0
    if wifi_key in wifis:
        wifi = wifis[wifi_key]
        if isinstance(wifi.new_measures, (int, long)):
            wifi.new_measures = Wifi.new_measures + 1
        else:
            # already a sql expression
            wifi.new_measures += 1
        if isinstance(wifi.total_measures, (int, long)):
            if wifi.total_measures < 5:
                # count wifis as new until they show up in the search
                new_wifi += 1
            wifi.total_measures = Wifi.total_measures + 1
        else:
            # already a sql expression
            wifi.total_measures += 1
    else:
        wifis[wifi_key] = wifi = Wifi(
            key=wifi_key, new_measures=1, total_measures=1)
        new_wifi += 1
        session.add(wifi)
    if userid is not None and new_wifi > 0:
        # update user score
        process_score(userid, new_wifi, session, key='new_wifi')
def process_measures(items, session, userid=None):
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(60)
    # get enough auto-increment ids assigned
    measures = []
    for i in range(len(items)):
        measure = Measure()
        measures.append(measure)
        session.add(measure)
    # TODO switch unique measure id to a uuid, so we don't have to
    # get these from a savepoint here
    session.flush()
    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item = process_time(item, utcnow, utcmin)
        cell, wifi = process_measure(measures[i].id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        positions.append({
            'lat': to_precise_int(item['lat']),
            'lon': to_precise_int(item['lon']),
        })
    heka_client = get_heka_client()
    if cell_measures:
        # group by and create task per cell key
        heka_client.incr("items.uploaded.cell_measures", len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cell_key = CellKey(measure['radio'], measure['mcc'],
                               measure['mnc'], measure['lac'],
                               measure['cid'], measure['psc'])
            cells[cell_key].append(measure)
        for values in cells.values():
            insert_cell_measures.delay(values, userid=userid)
    if wifi_measures:
        # group by and create task per wifi key
        heka_client.incr("items.uploaded.wifi_measures", len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)
        for values in wifis.values():
            insert_wifi_measures.delay(values, userid=userid)
    if userid is not None:
        process_score(userid, len(items), session)
    if positions:
        process_mapstat(positions, session, userid=userid)
def process_mapstat(positions, session, userid=None):
    # 10x10 meter tiles
    tile_count = process_mapstat_keyed(
        1000, MAPSTAT_TYPE['location'], positions, session)
    if userid is not None and tile_count > 0:
        process_score(userid, tile_count, session, key='new_location')
    # 100x100 m tiles
    process_mapstat_keyed(
        10000, MAPSTAT_TYPE['location_100m'], positions, session)
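# Hedged sketch (not part of the source): a possible body for the
# process_mapstat_keyed helper that the two keyed process_mapstat variants
# call. The signature (scale, stat_key, positions, session) is taken from the
# call sites above; the body is an assumption modelled on the earlier
# tile-aggregating process_mapstat variant, so names and behavior here are
# illustrative only.
def process_mapstat_keyed(scale, stat_key, positions, session):
    tiles = defaultdict(int)
    # aggregate positions into tiles of the given scale
    for position in positions:
        tiles[(position['lat'] / scale, position['lon'] / scale)] += 1
    lats = set([k[0] for k in tiles.keys()])
    lons = set([k[1] for k in tiles.keys()])
    result = session.query(MapStat).filter(
        MapStat.key == stat_key).filter(
        MapStat.lat.in_(lats)).filter(
        MapStat.lon.in_(lons)).all()
    prior = {}
    for r in result:
        prior[(r.lat, r.lon)] = r
    tile_count = 0
    for (lat, lon), value in tiles.items():
        old = prior.get((lat, lon), None)
        if old:
            # existing tile: push the increment down to SQL
            old.value = MapStat.value + value
        else:
            # new tile: insert, falling back to an update on duplicate key
            tile_count += 1
            stmt = MapStat.__table__.insert(
                on_duplicate='value = value + %s' % int(value)).values(
                lat=lat, lon=lon, key=stat_key, value=value)
            session.execute(stmt)
    return tile_count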
def process_measures(items, session, userid=None):
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(60)
    # get enough auto-increment ids assigned
    measures = []
    for i in range(len(items)):
        measure = Measure()
        measures.append(measure)
        session.add(measure)
    # TODO switch unique measure id to a uuid, so we don't have to
    # get these from a savepoint here
    session.flush()
    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item = process_time(item, utcnow, utcmin)
        cell, wifi = process_measure(measures[i].id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        positions.append({
            'lat': to_precise_int(item['lat']),
            'lon': to_precise_int(item['lon']),
        })
    heka_client = get_heka_client()
    if cell_measures:
        # group by and create task per cell key
        heka_client.incr("items.uploaded.cell_measures", len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)
        for values in cells.values():
            insert_cell_measures.delay(values, userid=userid)
    if wifi_measures:
        # group by and create task per wifi key
        heka_client.incr("items.uploaded.wifi_measures", len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)
        for values in wifis.values():
            insert_wifi_measures.delay(values, userid=userid)
    if userid is not None:
        process_score(userid, len(items), session)
    if positions:
        process_mapstat(positions, session, userid=userid)
def update_wifi_measure_count(wifi_key, wifis, created, session,
                              userid=None):
    new_wifi = 0
    if wifi_key not in wifis:
        new_wifi += 1
        wifis[wifi_key] = True
    stmt = Wifi.__table__.insert(
        on_duplicate='new_measures = new_measures + 1, '
                     'total_measures = total_measures + 1').values(
        key=wifi_key, created=created,
        new_measures=1, total_measures=1)
    session.execute(stmt)
    if userid is not None and new_wifi > 0:
        # update user score
        process_score(userid, new_wifi, session, key='new_wifi')
def process_wifi_measure(session, measure_data, entries, userid=None):
    wifi_measures = []
    wifi_count = defaultdict(int)
    wifi_keys = set([e['key'] for e in entries])
    created = decode_datetime(measure_data.get('created', ''))
    # did we get measures for blacklisted wifis?
    blacked = session.query(WifiBlacklist.key).filter(
        WifiBlacklist.key.in_(wifi_keys)).all()
    blacked = set([b[0] for b in blacked])
    # process entries
    for entry in entries:
        wifi_key = entry['key']
        # convert frequency into channel numbers and remove frequency
        convert_frequency(entry)
        wifi_measures.append(create_wifi_measure(measure_data, created, entry))
        if wifi_key not in blacked:
            # skip blacklisted wifi AP's
            wifi_count[wifi_key] += 1
    # update user score
    if userid is not None:
        # do we already know about any wifis?
        white_keys = wifi_keys - blacked
        if white_keys:
            wifis = session.query(Wifi.key).filter(Wifi.key.in_(white_keys))
            wifis = dict([(w[0], True) for w in wifis.all()])
        else:
            wifis = {}
        # subtract known wifis from all unique wifis
        new_wifis = len(wifi_count) - len(wifis)
        if new_wifis > 0:
            process_score(userid, new_wifis, session, key='new_wifi')
    # update new/total measure counts
    for wifi_key, num in wifi_count.items():
        stmt = Wifi.__table__.insert(
            on_duplicate='new_measures = new_measures + %s, '
                         'total_measures = total_measures + %s' % (num, num)
        ).values(
            key=wifi_key, created=created,
            new_measures=num, total_measures=num)
        session.execute(stmt)
    session.add_all(wifi_measures)
    return wifi_measures
def update_cell_measure_count(measure, session, userid=None):
    if (measure.radio == -1 or measure.lac == 0 or measure.cid == 0):
        # only update data for complete records
        return
    # do we already know about these cells?
    query = session.query(Cell).filter(
        Cell.radio == measure.radio).filter(
        Cell.mcc == measure.mcc).filter(
        Cell.mnc == measure.mnc).filter(
        Cell.lac == measure.lac).filter(
        Cell.cid == measure.cid
    )
    cell = query.first()
    new_cell = 0
    if cell:
        if isinstance(cell.new_measures, (int, long)):
            cell.new_measures = Cell.new_measures + 1
        else:
            # already a sql expression
            cell.new_measures += 1
        if isinstance(cell.total_measures, (int, long)):
            if cell.total_measures < 5:
                # count cells as new until they show up in the search
                new_cell += 1
            cell.total_measures = Cell.total_measures + 1
        else:
            # already a sql expression
            cell.total_measures += 1
            if cell.total_measures.right.value < 5:
                # count cells as new until they show up in the search
                new_cell += 1
    else:
        cell = Cell(radio=measure.radio, mcc=measure.mcc, mnc=measure.mnc,
                    lac=measure.lac, cid=measure.cid,
                    new_measures=1, total_measures=1)
        new_cell += 1
        session.add(cell)
    if userid is not None and new_cell > 0:
        # update user score
        process_score(userid, new_cell, session, key='new_cell')
def process_wifi_measures(session, entries, userid=None,
                          max_measures_per_wifi=11000):
    wifi_measures = []
    wifi_count = defaultdict(int)
    wifi_keys = set([e['key'] for e in entries])
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    # did we get measures for blacklisted wifis?
    blacked = session.query(WifiBlacklist.key).filter(
        WifiBlacklist.key.in_(wifi_keys)).all()
    blacked = set([b[0] for b in blacked])
    space_available = {}
    dropped_overflow = 0
    # process entries
    for entry in entries:
        wifi_key = entry['key']
        # check if there's space for new measurement within per-AP maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if wifi_key not in space_available:
            query = session.query(Wifi.total_measures).filter(
                Wifi.key == wifi_key)
            curr = query.first()
            if curr is not None:
                space_available[wifi_key] = max_measures_per_wifi - curr[0]
            else:
                space_available[wifi_key] = max_measures_per_wifi
        if space_available[wifi_key] > 0:
            space_available[wifi_key] -= 1
        else:
            dropped_overflow += 1
            continue
        # convert frequency into channel numbers and remove frequency
        convert_frequency(entry)
        wifi_measures.append(create_wifi_measure(utcnow, entry))
        if wifi_key not in blacked:
            # skip blacklisted wifi AP's
            wifi_count[wifi_key] += 1
    heka_client = get_heka_client()
    if dropped_overflow != 0:
        heka_client.incr("items.dropped.wifi_ingress_overflow",
                         count=dropped_overflow)
    # update user score
    if userid is not None:
        # do we already know about any wifis?
        white_keys = wifi_keys - blacked
        if white_keys:
            wifis = session.query(Wifi.key).filter(Wifi.key.in_(white_keys))
            wifis = dict([(w[0], True) for w in wifis.all()])
        else:
            wifis = {}
        # subtract known wifis from all unique wifis
        new_wifis = len(wifi_count) - len(wifis)
        if new_wifis > 0:
            process_score(userid, new_wifis, session, key='new_wifi')
    # update new/total measure counts
    for wifi_key, num in wifi_count.items():
        stmt = Wifi.__table__.insert(
            on_duplicate='new_measures = new_measures + %s, '
                         'total_measures = total_measures + %s' % (num, num)
        ).values(
            key=wifi_key, created=utcnow,
            new_measures=num, total_measures=num)
        session.execute(stmt)
    heka_client.incr("items.inserted.wifi_measures",
                     count=len(wifi_measures))
    session.add_all(wifi_measures)
    return wifi_measures
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}
    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue
        cell_key = to_cellkey_psc(cell_measure)
        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                Cell.radio == cell_key.radio,
                Cell.mcc == cell_key.mcc,
                Cell.mnc == cell_key.mnc,
                Cell.lac == cell_key.lac,
                Cell.cid == cell_key.cid,
                Cell.psc == cell_key.psc)
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell
        if space_available[cell_key] > 0:
            space_available[cell_key] -= 1
        else:
            dropped_overflow += 1
            continue
        # Possibly drop measure if we're receiving them too
        # quickly for this cell.
        query = session.query(Cell.total_measures).filter(
            Cell.radio == cell_measure.radio,
            Cell.mcc == cell_measure.mcc,
            Cell.mnc == cell_measure.mnc,
            Cell.lac == cell_measure.lac,
            Cell.cid == cell_measure.cid,
            Cell.psc == cell_measure.psc)
        total_measures = query.first()
        if total_measures is not None:
            if total_measures[0] > max_measures_per_cell:
                dropped_overflow += 1
                continue
        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1
    heka_client = get_heka_client()
    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)
    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)
    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)
    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')
    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
def process_station_measures(session, entries, station_type,
                             station_model, measure_model,
                             blacklist_model, create_measure,
                             create_key, join_key, userid=None,
                             max_measures_per_station=11000,
                             utcnow=None):
    all_measures = []
    dropped_blacklisted = 0
    dropped_malformed = 0
    dropped_overflow = 0
    stats_client = get_stats_client()
    new_stations = 0
    if utcnow is None:
        utcnow = util.utcnow()
    elif isinstance(utcnow, basestring):
        utcnow = decode_datetime(utcnow)

    # Process entries and group by validated station key
    station_measures = defaultdict(list)
    for entry in entries:
        measure = create_measure(utcnow, entry)
        if not measure:
            dropped_malformed += 1
            continue
        station_measures[create_key(measure)].append(measure)

    # Process measures one station at a time
    for key, measures in station_measures.items():
        incomplete = False
        is_new_station = False

        # Figure out how much space is left for this station.
        free = available_station_space(session, key, station_model,
                                       join_key, max_measures_per_station)
        if free is None:
            is_new_station = True
            free = max_measures_per_station

        if is_new_station:
            # Drop measures for blacklisted stations.
            if blacklisted_station(session, key, blacklist_model,
                                   join_key, utcnow):
                dropped_blacklisted += len(measures)
                continue

            incomplete = incomplete_measure(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept measures up to input-throttling limit, then drop.
        num = 0
        for measure in measures:
            if free <= 0:
                dropped_overflow += 1
                continue
            all_measures.append(measure)
            free -= 1
            num += 1

        # Accept incomplete measures, just don't make stations for them.
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            create_or_update_station(session, key, station_model,
                                     join_key, utcnow, num)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        process_score(userid, new_stations, session,
                      key='new_' + station_type)

    if dropped_blacklisted != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_blacklisted" % station_type,
            count=dropped_blacklisted)

    if dropped_malformed != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_malformed" % station_type,
            count=dropped_malformed)

    if dropped_overflow != 0:
        stats_client.incr(
            "items.dropped.%s_ingress_overflow" % station_type,
            count=dropped_overflow)

    stats_client.incr(
        "items.inserted.%s_measures" % station_type,
        count=len(all_measures))

    session.add_all(all_measures)
    return all_measures
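# Hedged usage sketch (not part of the source): one way the generic
# process_station_measures pipeline above might be wired up for WiFi
# measures. Wifi, WifiBlacklist and create_wifi_measure appear elsewhere in
# this module; WifiMeasure, to_wifikey and join_wifikey are assumed helper
# names introduced here only for illustration.
def insert_wifi_measures_sketch(session, entries, userid=None):
    return process_station_measures(
        session, entries,
        station_type='wifi',
        station_model=Wifi,
        measure_model=WifiMeasure,           # assumed measure model
        blacklist_model=WifiBlacklist,
        create_measure=create_wifi_measure,  # (utcnow, entry) -> measure
        create_key=to_wifikey,               # assumed: extracts the WiFi key
        join_key=join_wifikey,               # assumed: builds the SQL filter
        userid=userid)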
def process_mapstat(measures, session, userid=None):
    # 10x10 meter tiles
    tile_count = process_mapstat_keyed(
        1000, MAPSTAT_TYPE["location"], measures, session)
    if userid is not None and tile_count > 0:
        process_score(userid, tile_count, session, key="new_location")
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}
    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue
        cell_key = to_cellkey_psc(cell_measure)
        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                *join_cellkey(Cell, cell_key))
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell
        if space_available[cell_key] > 0:
            space_available[cell_key] -= 1
        else:
            dropped_overflow += 1
            continue
        # Possibly drop measure if we're receiving them too
        # quickly for this cell.
        query = session.query(Cell.total_measures).filter(
            *join_cellkey(Cell, cell_key))
        total_measures = query.first()
        if total_measures is not None:
            if total_measures[0] > max_measures_per_cell:
                dropped_overflow += 1
                continue
        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1
    heka_client = get_heka_client()
    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)
    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)
    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)
    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')
    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
def process_measures(items, session, userid=None):
    stats_client = get_stats_client()
    utcnow = util.utcnow()
    utcmin = utcnow - datetime.timedelta(60)
    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item = process_time(item, utcnow, utcmin)
        report_id = uuid.uuid1().hex
        cell, wifi = process_measure(report_id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': item['lat'],
                'lon': item['lon'],
            })

    if cell_measures:
        # group by and create task per cell key
        stats_client.incr("items.uploaded.cell_measures",
                          len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)
        for values in cells.values():
            # insert measures, expire the task if it wasn't processed
            # after two hours to avoid queue overload
            insert_cell_measures.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=7200)

    if wifi_measures:
        # group by WiFi key
        stats_client.incr("items.uploaded.wifi_measures",
                          len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        # Create a task per group of 5 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few measures per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 5
        for i in range(0, len(wifis), batch_size):
            values = []
            for measures in wifis[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after two hours to avoid queue overload
            insert_wifi_measures.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=7200)

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, utcnow, positions)