def process_score(self, userid, positions, new_station_count):
    if userid is None or len(positions) <= 0:
        return

    queue = self.task.app.data_queues['update_score']
    scores = []

    key = Score.to_hashkey(
        userid=userid,
        key=ScoreKey.location,
        time=None)
    scores.append({'hashkey': key, 'value': len(positions)})

    for name, score_key in (('cell', ScoreKey.new_cell),
                            ('wifi', ScoreKey.new_wifi)):
        count = new_station_count[name]
        if count <= 0:
            continue
        key = Score.to_hashkey(
            userid=userid,
            key=score_key,
            time=None)
        scores.append({'hashkey': key, 'value': count})

    queue.enqueue(scores)
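# Illustrative sketch (values assumed, not from the source): for a batch
# with two usable positions, one new cell, and no new wifis, the payload
# enqueued above would look roughly like:
#
#   [{'hashkey': Score.to_hashkey(userid=1, key=ScoreKey.location, time=None),
#     'value': 2},
#    {'hashkey': Score.to_hashkey(userid=1, key=ScoreKey.new_cell, time=None),
#     'value': 1}]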
def insert(self, entries, userid=None):
    all_observations = []
    drop_counter = defaultdict(int)
    new_stations = 0

    # Process entries and group by validated station key
    station_observations = defaultdict(list)
    for entry in entries:
        self.pre_process_entry(entry)

        obs = self.observation_model.create(**entry)
        if not obs:
            drop_counter['malformed'] += 1
            continue

        station_observations[obs.hashkey()].append(obs)

    # Process observations one station at a time
    for key, observations in station_observations.items():
        first_blacklisted = None
        incomplete = False

        station = self.station_model.querykey(self.session, key).first()
        if station is None:
            # Drop observations for blacklisted stations.
            blacklisted, first_blacklisted = self.blacklisted_station(key)
            if blacklisted:
                drop_counter['blacklisted'] += len(observations)
                continue

            incomplete = self.incomplete_observation(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept all observations
        all_observations.extend(observations)
        num = len(observations)

        # Accept incomplete observations, just don't make stations for them
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            self.create_or_update_station(station, key, num,
                                          first_blacklisted)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        scorekey = Score.to_hashkey(
            userid=userid,
            key=ScoreKey['new_' + self.station_type],
            time=self.utcnow.date())
        Score.incr(self.session, scorekey, new_stations)

    added = len(all_observations)
    self.emit_stats(added, drop_counter)

    self.session.add_all(all_observations)
    return added
def insert(self, entries, userid=None):
    all_observations = []
    drop_counter = defaultdict(int)
    new_stations = 0

    # Process entries and group by validated station key
    station_observations = defaultdict(list)
    for entry in entries:
        self.pre_process_entry(entry)

        obs = self.observation_model.create(**entry)
        if not obs:
            drop_counter['malformed'] += 1
            continue

        station_observations[obs.hashkey()].append(obs)

    # Process observations one station at a time
    for key, observations in station_observations.items():
        first_blacklisted = None
        incomplete = False

        station = self.station_model.getkey(self.session, key)
        if station is None:
            # Drop observations for blacklisted stations.
            blacklisted, first_blacklisted = self.blacklisted_station(key)
            if blacklisted:
                drop_counter['blacklisted'] += len(observations)
                continue

            incomplete = self.incomplete_observation(key)
            if not incomplete:
                # We discovered an actual new complete station.
                new_stations += 1

        # Accept all observations
        all_observations.extend(observations)
        num = len(observations)

        # Accept incomplete observations, just don't make stations for them
        # (station creation is a side effect of count-updating)
        if not incomplete and num > 0:
            self.create_or_update_station(station, key, num,
                                          first_blacklisted)

    # Credit the user with discovering any new stations.
    if userid is not None and new_stations > 0:
        scorekey = Score.to_hashkey(
            userid=userid,
            key=ScoreKey['new_' + self.station_type],
            time=self.utcnow.date())
        Score.incr(self.session, scorekey, new_stations)

    added = len(all_observations)
    self.emit_stats(added, drop_counter)

    self.session.add_all(all_observations)
    return added
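# Summary of the per-station decision in insert() above, for a key with
# no existing station row (a sketch derived from the branches, not a
# separate API):
#
#   blacklisted key      -> drop its observations, create no station
#   incomplete key       -> keep observations, create no station
#   complete, unseen key -> keep observations, create/update the station,
#                           and count it toward the user's score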
def queue_scores(self, userid, new_stations):
    # Credit the user with discovering any new stations.
    if userid is None or new_stations <= 0:
        return

    queue = self.task.app.data_queues["update_score"]
    key = Score.to_hashkey(userid=userid,
                           key=self.station_score,
                           time=None)
    queue.enqueue([{"hashkey": key, "value": int(new_stations)}])
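# Hypothetical call site (values assumed, not from the source): an update
# task that processed a batch and found three previously unseen stations
# for user 42 would credit them with:
#
#   self.queue_scores(userid=42, new_stations=3)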
def process_score(self, userid, positions):
    if userid is None or len(positions) <= 0:
        return

    queue = self.task.app.data_queues['update_score']
    key = Score.to_hashkey(
        userid=userid,
        key=ScoreKey.location,
        time=None)
    queue.enqueue([{'hashkey': key, 'value': len(positions)}])
def process_reports(self, reports, userid=None):
    positions = []
    cell_observations = []
    wifi_observations = []
    for i, report in enumerate(reports):
        report['report_id'] = uuid.uuid1()
        cell, wifi = self.process_report(report)
        cell_observations.extend(cell)
        wifi_observations.extend(wifi)
        if cell or wifi:
            positions.append({
                'lat': report['lat'],
                'lon': report['lon'],
            })

    if cell_observations:
        # group by and create task per cell key
        self.stats_client.incr('items.uploaded.cell_observations',
                               len(cell_observations))
        if self.api_key_log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'cell_observations' % self.api_key_name,
                len(cell_observations))

        cells = defaultdict(list)
        for obs in cell_observations:
            cells[CellObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 5 cell keys at a time.
        # Grouping them helps in avoiding per-task overhead.
        cells = list(cells.values())
        batch_size = 5
        countdown = 0
        for i in range(0, len(cells), batch_size):
            values = []
            for observations in cells[i:i + batch_size]:
                values.extend([encode_radio_dict(o) for o in observations])
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            self.insert_cell_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if wifi_observations:
        # group by WiFi key
        self.stats_client.incr('items.uploaded.wifi_observations',
                               len(wifi_observations))
        if self.api_key_log:
            self.stats_client.incr(
                'items.api_log.%s.uploaded.'
                'wifi_observations' % self.api_key_name,
                len(wifi_observations))

        wifis = defaultdict(list)
        for obs in wifi_observations:
            wifis[WifiObservation.to_hashkey(obs)].append(obs)

        # Create a task per group of 20 WiFi keys at a time.
        # We tend to get a huge number of unique WiFi networks per
        # batch upload, with one to very few observations per WiFi.
        # Grouping them helps in avoiding per-task overhead.
        wifis = list(wifis.values())
        batch_size = 20
        countdown = 0
        for i in range(0, len(wifis), batch_size):
            values = []
            for observations in wifis[i:i + batch_size]:
                values.extend(observations)
            # insert observations, expire the task if it wasn't processed
            # after six hours to avoid queue overload, also delay
            # each task by one second more, to get a more even workload
            # and avoid parallel updates of the same underlying stations
            self.insert_wifi_task.apply_async(
                args=[values],
                kwargs={'userid': userid},
                expires=21600,
                countdown=countdown)
            countdown += 1

    if userid is not None:
        scorekey = Score.to_hashkey(
            userid=userid,
            key=ScoreKey.location,
            time=util.utcnow().date())
        Score.incr(self.session, scorekey, len(positions))

    if positions:
        self.process_mapstat(positions)
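# A minimal standalone sketch of the batching idiom used twice above:
# slice the grouped station keys into fixed-size chunks and delay each
# chunk's task by one more second than the previous one. `make_task` is
# an assumed stand-in for apply_async, not part of the source.
def schedule_batches(groups, batch_size, make_task):
    countdown = 0
    for i in range(0, len(groups), batch_size):
        make_task(groups[i:i + batch_size], countdown=countdown)
        countdown += 1

# e.g. schedule_batches(list(cells.values()), 5, make_task) would create
# one task per five cell keys, scheduled at countdowns 0, 1, 2, ...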
def test_insert_observations_invalid_lac(self):
    session = self.session
    schema = ValidCellKeySchema()
    time = util.utcnow() - timedelta(days=1)
    today = util.utcnow().date()

    session.add(Cell(radio=Radio.gsm, mcc=FRANCE_MCC, mnc=2,
                     lac=3, cid=4, new_measures=2, total_measures=5))
    session.add(Score(key=ScoreKey.new_cell,
                      userid=1, time=today, value=7))
    session.flush()

    obs = dict(
        created=time, lat=PARIS_LAT, lon=PARIS_LON,
        time=time, accuracy=0, altitude=0,
        altitude_accuracy=0, radio=int(Radio.gsm))
    entries = [
        {"mcc": FRANCE_MCC, "mnc": 2,
         "lac": constants.MAX_LAC_ALL + 1,
         "cid": constants.MAX_CID_ALL + 1,
         "psc": 5, "asu": 8},
        {"mcc": FRANCE_MCC, "mnc": 2,
         "lac": schema.fields['lac'].missing,
         "cid": schema.fields['cid'].missing,
         "psc": 5, "asu": 8},
    ]
    for e in entries:
        e.update(obs)

    result = insert_measures_cell.delay(entries, userid=1)
    self.assertEqual(result.get(), 2)

    observations = session.query(CellObservation).all()
    self.assertEqual(len(observations), 2)
    self.assertEqual(set([o.lac for o in observations]),
                     set([schema.fields['lac'].missing]))
    self.assertEqual(set([o.cid for o in observations]),
                     set([schema.fields['cid'].missing]))

    # Nothing should change in the initially created Cell record
    cells = session.query(Cell).all()
    self.assertEqual(len(cells), 1)
    self.assertEqual(set([c.new_measures for c in cells]), set([2]))
    self.assertEqual(set([c.total_measures for c in cells]), set([5]))