def process_measure(report_id, data, session):

    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # we want to only add those not already present
        for (k, v) in src.items():
            if k not in dst:
                dst[k] = v

    cell_measures = {}
    wifi_measures = {}
    measure_data = dict(
        report_id=report_id,
        lat=data['lat'],
        lon=data['lon'],
        heading=data.get('heading', -1.0),
        speed=data.get('speed', -1.0),
        time=encode_datetime(data['time']),
        accuracy=data.get('accuracy', 0),
        altitude=data.get('altitude', 0),
        altitude_accuracy=data.get('altitude_accuracy', 0),
    )
    measure_radio = RADIO_TYPE.get(data['radio'], -1)
    if data.get('cell'):
        # flatten measure / cell data into a single dict
        for c in data['cell']:
            add_missing_dict_entries(c, measure_data)
            c = normalized_cell_measure_dict(c, measure_radio)
            if c is None:
                continue
            key = to_cellkey_psc(c)
            if key in cell_measures:
                existing = cell_measures[key]
                if existing['ta'] > c['ta'] or \
                   (existing['signal'] != 0 and
                    existing['signal'] < c['signal']) or \
                   existing['asu'] < c['asu']:
                    cell_measures[key] = c
            else:
                cell_measures[key] = c
    cell_measures = cell_measures.values()
    # flatten measure / wifi data into a single dict
    if data.get('wifi'):
        for w in data['wifi']:
            add_missing_dict_entries(w, measure_data)
            w = normalized_wifi_measure_dict(w)
            if w is None:
                continue
            key = w['key']
            if key in wifi_measures:
                existing = wifi_measures[key]
                if existing['signal'] != 0 and \
                   existing['signal'] < w['signal']:
                    wifi_measures[key] = w
            else:
                wifi_measures[key] = w
    wifi_measures = wifi_measures.values()
    return (cell_measures, wifi_measures)
def process_measures(items, session, userid=None):
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    utcmin = utcnow - datetime.timedelta(60)

    # get enough auto-increment ids assigned
    measures = []
    for i in range(len(items)):
        measure = Measure()
        measures.append(measure)
        session.add(measure)
    # TODO switch unique measure id to a uuid, so we don't have to
    # get these from a savepoint here
    session.flush()

    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item = process_time(item, utcnow, utcmin)
        cell, wifi = process_measure(measures[i].id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        positions.append({
            'lat': to_precise_int(item['lat']),
            'lon': to_precise_int(item['lon']),
        })

    heka_client = get_heka_client()

    if cell_measures:
        # group by and create task per cell key
        heka_client.incr("items.uploaded.cell_measures",
                         len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)

        for values in cells.values():
            insert_cell_measures.delay(values, userid=userid)

    if wifi_measures:
        # group by and create task per wifi key
        heka_client.incr("items.uploaded.wifi_measures",
                         len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        for values in wifis.values():
            insert_wifi_measures.delay(values, userid=userid)

    if userid is not None:
        process_score(userid, len(items), session)
    if positions:
        process_mapstat(positions, session, userid=userid)
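# process_time is called above but not shown here. A hypothetical
# sketch of the clamping implied by its call sites (utcmin is 60 days
# before utcnow); the actual implementation in the codebase may differ:
def process_time(measure, utcnow, utcmin):
    try:
        # parse the submitted ISO 8601 timestamp
        measure['time'] = iso8601.parse_date(measure['time'])
    except (iso8601.ParseError, TypeError):
        # unparseable or missing times fall back to processing time
        measure['time'] = utcnow
    else:
        # reject times outside the accepted 60-day window
        if not (utcmin <= measure['time'] <= utcnow):
            measure['time'] = utcnow
    return measure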
def process_measure(data, session):

    def add_missing_dict_entries(dst, src):
        # x.update(y) overwrites entries in x with those in y;
        # We want to only add those not already present.
        # We also only want to copy the top-level base measure data
        # and not any nested values like cell or wifi.
        for (k, v) in src.items():
            if k != 'radio' and k not in dst \
               and not isinstance(v, (tuple, list, dict)):
                dst[k] = v

    measure_data = normalized_measure_dict(data)
    if measure_data is None:
        return ([], [])

    cell_measures = {}
    wifi_measures = {}
    measure_radio = RADIO_TYPE.get(data['radio'], -1)
    if data.get('cell'):
        # flatten measure / cell data into a single dict
        for c in data['cell']:
            add_missing_dict_entries(c, measure_data)
            c = normalized_cell_measure_dict(c, measure_radio)
            if c is None:  # pragma: no cover
                continue
            key = to_cellkey_psc(c)
            if key in cell_measures:  # pragma: no cover
                existing = cell_measures[key]
                if existing['ta'] > c['ta'] or \
                   (existing['signal'] != 0 and
                    existing['signal'] < c['signal']) or \
                   existing['asu'] < c['asu']:
                    cell_measures[key] = c
            else:
                cell_measures[key] = c
    cell_measures = cell_measures.values()
    # flatten measure / wifi data into a single dict
    if data.get('wifi'):
        for w in data['wifi']:
            add_missing_dict_entries(w, measure_data)
            w = normalized_wifi_measure_dict(w)
            if w is None:
                continue
            key = w['key']
            if key in wifi_measures:  # pragma: no cover
                existing = wifi_measures[key]
                if existing['signal'] != 0 and \
                   existing['signal'] < w['signal']:
                    wifi_measures[key] = w
            else:
                wifi_measures[key] = w
    wifi_measures = wifi_measures.values()
    return (cell_measures, wifi_measures)
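# The refactor above replaces the inline measure_data construction
# from the first version with normalized_measure_dict. A minimal
# sketch of that helper, reconstructed from the earlier inline
# defaults (hypothetical; the real function performs more validation
# and returns None for unusable measures):
def normalized_measure_dict(data):
    if 'lat' not in data or 'lon' not in data:  # assumed validation
        return None
    return dict(
        report_id=data.get('report_id'),
        lat=data['lat'],
        lon=data['lon'],
        heading=data.get('heading', -1.0),
        speed=data.get('speed', -1.0),
        time=encode_datetime(data['time']),
        accuracy=data.get('accuracy', 0),
        altitude=data.get('altitude', 0),
        altitude_accuracy=data.get('altitude_accuracy', 0),
    )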
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}

    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue

        cell_key = to_cellkey_psc(cell_measure)

        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                Cell.radio == cell_key.radio,
                Cell.mcc == cell_key.mcc,
                Cell.mnc == cell_key.mnc,
                Cell.lac == cell_key.lac,
                Cell.cid == cell_key.cid,
                Cell.psc == cell_key.psc)
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell

        if space_available[cell_key] > 0:
            space_available[cell_key] -= 1
        else:
            dropped_overflow += 1
            continue

        # Possibly drop measure if we're receiving them too
        # quickly for this cell.
        query = session.query(Cell.total_measures).filter(
            Cell.radio == cell_measure.radio,
            Cell.mcc == cell_measure.mcc,
            Cell.mnc == cell_measure.mnc,
            Cell.lac == cell_measure.lac,
            Cell.cid == cell_measure.cid,
            Cell.psc == cell_measure.psc)
        total_measures = query.first()
        if total_measures is not None:
            if total_measures[0] > max_measures_per_cell:
                dropped_overflow += 1
                continue

        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1

    heka_client = get_heka_client()
    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)
    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)

    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)

    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')

    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
def process_cell_measures(session, entries, userid=None,
                          max_measures_per_cell=11000):
    cell_count = defaultdict(int)
    cell_measures = []
    utcnow = datetime.datetime.utcnow().replace(tzinfo=iso8601.UTC)
    dropped_malformed = 0
    dropped_overflow = 0
    space_available = {}

    # process entries
    for entry in entries:
        cell_measure = create_cell_measure(utcnow, entry)
        if not cell_measure:
            dropped_malformed += 1
            continue

        cell_key = to_cellkey_psc(cell_measure)

        # check if there's space for new measurement within per-cell maximum
        # note: old measures gradually expire, so this is an intake-rate limit
        if cell_key not in space_available:
            query = session.query(Cell.total_measures).filter(
                *join_cellkey(Cell, cell_key))
            curr = query.first()
            if curr is not None:
                space_available[cell_key] = max_measures_per_cell - curr[0]
            else:
                space_available[cell_key] = max_measures_per_cell

        if space_available[cell_key] > 0:
            space_available[cell_key] -= 1
        else:
            dropped_overflow += 1
            continue

        # Possibly drop measure if we're receiving them too
        # quickly for this cell.
        query = session.query(Cell.total_measures).filter(
            *join_cellkey(Cell, cell_key))
        total_measures = query.first()
        if total_measures is not None:
            if total_measures[0] > max_measures_per_cell:
                dropped_overflow += 1
                continue

        cell_measures.append(cell_measure)
        # group per unique cell
        cell_count[cell_key] += 1

    heka_client = get_heka_client()
    if dropped_malformed != 0:
        heka_client.incr("items.dropped.cell_ingress_malformed",
                         count=dropped_malformed)
    if dropped_overflow != 0:
        heka_client.incr("items.dropped.cell_ingress_overflow",
                         count=dropped_overflow)

    # update new/total measure counts
    new_cells = 0
    for cell_key, count in cell_count.items():
        new_cells += update_cell_measure_count(
            cell_key, count, utcnow, session)

    # update user score
    if userid is not None and new_cells > 0:
        process_score(userid, new_cells, session, key='new_cell')

    heka_client.incr("items.inserted.cell_measures",
                     count=len(cell_measures))
    session.add_all(cell_measures)
    return cell_measures
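# The only change from the previous version is replacing the six
# hand-written column comparisons with join_cellkey. A sketch of that
# helper as implied by the inline filters it replaces (the argument
# names are assumptions):
def join_cellkey(model, key):
    # match each field of a cell key against the model's columns,
    # for use as session.query(...).filter(*join_cellkey(Cell, key))
    return [model.radio == key.radio,
            model.mcc == key.mcc,
            model.mnc == key.mnc,
            model.lac == key.lac,
            model.cid == key.cid,
            model.psc == key.psc]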
def process_measures(items, session, userid=None):
    stats_client = get_stats_client()
    utcnow = util.utcnow()
    utcmin = utcnow - datetime.timedelta(60)

    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item = process_time(item, utcnow, utcmin)
        report_id = uuid.uuid1().hex
        cell, wifi = process_measure(report_id, item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': item['lat'],
                'lon': item['lon'],
            })

    if cell_measures:
        # group by and create task per cell key
        stats_client.incr("items.uploaded.cell_measures",
                          len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)

        for values in cells.values():
            # insert measures, expire the task if it wasn't processed
            # after two hours to avoid queue overload
            insert_cell_measures.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=7200)

    if wifi_measures:
        # group by WiFi key
        stats_client.incr("items.uploaded.wifi_measures",
                          len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        # Create a task per group of 5 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few measures per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 5
        for i in range(0, len(wifis), batch_size):
            values = []
            for measures in wifis[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after two hours to avoid queue overload
            insert_wifi_measures.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=7200)

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, utcnow, positions)
def process_measures(items, session, userid=None):
    stats_client = get_stats_client()

    positions = []
    cell_measures = []
    wifi_measures = []
    for i, item in enumerate(items):
        item['report_id'] = uuid.uuid1().hex
        cell, wifi = process_measure(item, session)
        cell_measures.extend(cell)
        wifi_measures.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': item['lat'],
                'lon': item['lon'],
            })

    if cell_measures:
        # group by and create task per cell key
        stats_client.incr("items.uploaded.cell_measures",
                          len(cell_measures))
        cells = defaultdict(list)
        for measure in cell_measures:
            cells[to_cellkey_psc(measure)].append(measure)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for measures in cells[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_cell.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_measures:
        # group by WiFi key
        stats_client.incr("items.uploaded.wifi_measures",
                          len(wifi_measures))
        wifis = defaultdict(list)
        for measure in wifi_measures:
            wifis[measure['key']].append(measure)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few measures per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for measures in wifis[i:i + batch_size]:
                values.extend(measures)
            # insert measures, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            insert_measures_wifi.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        process_score(userid, len(positions), session)
    if positions:
        process_mapstat(session, positions)
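# Both branches above repeat the same group/batch/schedule pattern. A
# hypothetical helper could factor it out (a sketch, not part of the
# codebase; task, batch_size and expires are parameters introduced
# here), called e.g. as:
#   schedule_batches(insert_measures_cell, cells, 5, 21600, userid)
#   schedule_batches(insert_measures_wifi, wifis, 20, 21600, userid)
def schedule_batches(task, groups, batch_size, expires, userid=None):
    groups = list(groups.values())
    countdown = 0
    for i in range(0, len(groups), batch_size):
        values = []
        for measures in groups[i:i + batch_size]:
            values.extend(measures)
        # one task per batch, staggered by a second each to spread
        # the workload and avoid parallel updates of the same stations
        task.apply_async(args=[values], kwargs={'userid': userid},
                         expires=expires, countdown=countdown)
        countdown += 1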