def new_stations(self, name, station_keys):
    if len(station_keys) == 0:
        return 0

    # assume all stations are unknown
    unknown_keys = set(station_keys)

    if name == 'wifi':
        # there is only one combined table structure
        shards = defaultdict(list)
        for mac in unknown_keys:
            shards[WifiShard.shard_model(mac)].append(mac)
        for shard, macs in shards.items():
            query = (self.session.query(shard.mac)
                                 .filter(shard.mac.in_(macs)))
            unknown_keys -= set([r.mac for r in query.all()])
    elif name == 'cell':
        # first check the station table, which is more likely to contain
        # stations
        station_iter = Cell.iterkeys(
            self.session,
            list(unknown_keys),
            # only load the columns required for the hashkey
            extra=lambda query: query.options(
                load_only(*tuple(Cell._hashkey_cls._fields))))
        # subtract all stations which are found in the station table
        unknown_keys -= set([sta.hashkey() for sta in station_iter])
        if len(unknown_keys) == 0:  # pragma: no cover
            return 0

        # Only check the blocklist table for the still unknown keys.
        # There is no need to check for the already found keys again.
        block_iter = CellBlocklist.iterkeys(
            self.session,
            list(unknown_keys),
            # only load the columns required for the hashkey
            extra=lambda query: query.options(
                load_only(*tuple(CellBlocklist._hashkey_cls._fields))))
        # subtract all stations which are found in the blocklist table
        unknown_keys -= set([block.hashkey() for block in block_iter])

    return len(unknown_keys)
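
# Illustrative sketch (not part of the original module): the wifi branch above
# groups the unknown MAC keys by their shard model first, so each shard table
# only receives a single IN() query. A minimal standalone version of that
# grouping pattern, using a hypothetical shard_name() helper in place of
# WifiShard.shard_model(), might look like this:
#
#     from collections import defaultdict
#
#     def shard_name(mac):
#         # hypothetical routing rule: pick a shard by the last hex digit
#         return 'wifi_shard_%s' % mac[-1].lower()
#
#     def group_by_shard(macs):
#         shards = defaultdict(list)
#         for mac in macs:
#             shards[shard_name(mac)].append(mac)
#         return shards
#
#     # group_by_shard(['a1b2c3d4e5f0', 'a1b2c3d4e5ff'])
#     # -> {'wifi_shard_0': ['a1b2c3d4e5f0'], 'wifi_shard_f': ['a1b2c3d4e5ff']}
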
def __call__(self, batch=10):
    all_observations = self.data_queue.dequeue(batch=batch)
    drop_counter = defaultdict(int)
    added = 0
    new_stations = 0
    station_obs = defaultdict(list)

    for obs in all_observations:
        station_obs[Cell.to_hashkey(obs)].append(obs)

    if not station_obs:
        return (0, 0)

    stations = {}
    for station in Cell.iterkeys(self.session, list(station_obs.keys())):
        stations[station.hashkey()] = station

    blocklist = self.blocklisted_stations(station_obs.keys())

    new_station_values = []
    changed_station_values = []
    moving_stations = set()
    for station_key, observations in station_obs.items():
        blocked, first_blocked, block = blocklist.get(
            station_key, (False, None, None))

        if not any(observations):
            continue

        if blocked:
            # Drop observations for blocklisted stations.
            drop_counter['blocklisted'] += len(observations)
            continue

        station = stations.get(station_key, None)
        if station is None and not first_blocked:
            # We discovered an actual new never before seen station.
            new_stations += 1

        moving, new_values, changed_values = self.new_station_values(
            station, station_key, first_blocked, observations)
        if moving:
            moving_stations.add((station_key, block))
        else:
            added += len(observations)
            if new_values:
                new_station_values.append(new_values)
            if changed_values:
                changed_station_values.append(changed_values)

        # track potential updates to dependent areas
        self.add_area_update(station_key)

    if new_station_values:
        # do a batch insert of new stations
        stmt = Cell.__table__.insert(
            mysql_on_duplicate='total_measures = total_measures'  # no-op
        )
        # but limit the batch depending on each model
        ins_batch = Cell._insert_batch
        for i in range(0, len(new_station_values), ins_batch):
            batch_values = new_station_values[i:i + ins_batch]
            self.session.execute(stmt.values(batch_values))

    if changed_station_values:
        # do a batch update of changed stations
        ins_batch = Cell._insert_batch
        for i in range(0, len(changed_station_values), ins_batch):
            batch_values = changed_station_values[i:i + ins_batch]
            self.session.bulk_update_mappings(Cell, batch_values)

    if self.updated_areas:
        self.queue_area_updates()

    if moving_stations:
        self.blocklist_stations(moving_stations)

    self.emit_stats(added, drop_counter)
    self.emit_statcounters(added, new_stations)

    if self.data_queue.enough_data(batch=batch):  # pragma: no cover
        self.update_task.apply_async(
            kwargs={'batch': batch},
            countdown=2,
            expires=10)

    return (len(stations) + len(new_station_values), len(moving_stations))
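
# Illustrative sketch (not part of the original module): both the bulk insert
# and the bulk update paths above slice their value lists into fixed-size
# chunks before handing them to the session. A standalone version of that
# chunking pattern, with a hypothetical chunk size standing in for
# Cell._insert_batch, could look like this:
#
#     def chunks(values, size):
#         # yield successive slices of at most `size` items
#         for i in range(0, len(values), size):
#             yield values[i:i + size]
#
#     # hypothetical usage with a chunk size of 50:
#     # for batch_values in chunks(new_station_values, 50):
#     #     session.execute(stmt.values(batch_values))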