Example #1
0
    def new_stations(self, name, station_keys):
        """Return how many of the given station keys are not yet known.

        :param name: Station type, either 'wifi' or 'cell'. Any other
            value skips the database lookups entirely and counts all
            distinct keys as new.
        :param station_keys: Iterable of station keys (macs for wifi,
            hashkey-compatible keys for cell).
        :returns: Number of keys found neither in the station table(s)
            nor (for cells) in the blocklist table.
        """
        if not station_keys:
            return 0

        # assume all stations are unknown until found in a table
        unknown_keys = set(station_keys)

        if name == 'wifi':
            # there is only one combined table structure, but the rows
            # are spread over shard tables chosen per mac
            shards = defaultdict(list)
            for mac in unknown_keys:
                shards[WifiShard.shard_model(mac)].append(mac)
            for shard, macs in shards.items():
                query = (self.session.query(shard.mac)
                                     .filter(shard.mac.in_(macs)))
                unknown_keys -= {row.mac for row in query.all()}
        elif name == 'cell':
            # first check the station table, which is more likely to contain
            # stations
            station_iter = Cell.iterkeys(
                self.session,
                list(unknown_keys),
                # only load the columns required for the hashkey
                extra=lambda query: query.options(
                    load_only(*tuple(Cell._hashkey_cls._fields))))
            # subtract all stations which are found in the station table
            unknown_keys -= {sta.hashkey() for sta in station_iter}
            if not unknown_keys:  # pragma: no cover
                return 0

            # Only check the blocklist table for the still unknown keys.
            # There is no need to check for the already found keys again.
            block_iter = CellBlocklist.iterkeys(
                self.session,
                list(unknown_keys),
                # only load the columns required for the hashkey
                extra=lambda query: query.options(
                    load_only(*tuple(CellBlocklist._hashkey_cls._fields))))
            # subtract all stations which are found in the blocklist table
            unknown_keys -= {block.hashkey() for block in block_iter}

        return len(unknown_keys)
Example #2
0
    def __call__(self, batch=10):
        """Process a batch of queued cell observations.

        Dequeues up to ``batch`` observations, groups them per station,
        drops data for blocklisted stations, batch-inserts newly seen
        stations, batch-updates changed ones, blocklists stations that
        look like they are moving, queues dependent area updates and
        emits stats. Re-schedules itself if more data is waiting.

        :param batch: Maximum number of observations to dequeue.
        :returns: Tuple of (number of known + newly inserted stations,
            number of moving stations detected).
        """
        all_observations = self.data_queue.dequeue(batch=batch)
        # per-reason counts of dropped observations, reported via stats
        drop_counter = defaultdict(int)
        added = 0
        new_stations = 0
        # observations grouped by the station they belong to
        station_obs = defaultdict(list)

        for obs in all_observations:
            station_obs[Cell.to_hashkey(obs)].append(obs)

        if not station_obs:
            return (0, 0)

        # load the already-known station rows, keyed by their hashkey
        stations = {}
        for station in Cell.iterkeys(self.session, list(station_obs.keys())):
            stations[station.hashkey()] = station

        # NOTE(review): assumed to map station_key ->
        # (blocked, first_blocked, block), based on the unpacking below --
        # confirm against blocklisted_stations().
        blocklist = self.blocklisted_stations(station_obs.keys())

        new_station_values = []
        changed_station_values = []
        moving_stations = set()
        for station_key, observations in station_obs.items():
            blocked, first_blocked, block = blocklist.get(
                station_key, (False, None, None))

            # skip keys whose observation list holds only falsy entries
            if not any(observations):
                continue

            if blocked:
                # Drop observations for blocklisted stations.
                drop_counter['blocklisted'] += len(observations)
                continue

            station = stations.get(station_key, None)
            if station is None and not first_blocked:
                # We discovered an actual new never before seen station.
                new_stations += 1

            moving, new_values, changed_values = self.new_station_values(
                station, station_key, first_blocked, observations)
            if moving:
                moving_stations.add((station_key, block))
            else:
                added += len(observations)
                if new_values:
                    new_station_values.append(new_values)
                if changed_values:
                    changed_station_values.append(changed_values)

            # track potential updates to dependent areas
            self.add_area_update(station_key)

        if new_station_values:
            # do a batch insert of new stations
            stmt = Cell.__table__.insert(
                mysql_on_duplicate='total_measures = total_measures'  # no-op
            )
            # but limit the batch depending on each model
            ins_batch = Cell._insert_batch
            for i in range(0, len(new_station_values), ins_batch):
                batch_values = new_station_values[i:i + ins_batch]
                self.session.execute(stmt.values(batch_values))

        if changed_station_values:
            # do a batch update of changed stations
            ins_batch = Cell._insert_batch
            for i in range(0, len(changed_station_values), ins_batch):
                batch_values = changed_station_values[i:i + ins_batch]
                self.session.bulk_update_mappings(Cell, batch_values)

        if self.updated_areas:
            self.queue_area_updates()

        if moving_stations:
            self.blocklist_stations(moving_stations)

        self.emit_stats(added, drop_counter)
        self.emit_statcounters(added, new_stations)

        # if more data is already queued, schedule a follow-up run
        if self.data_queue.enough_data(batch=batch):  # pragma: no cover
            self.update_task.apply_async(
                kwargs={'batch': batch},
                countdown=2,
                expires=10)

        return (len(stations) + len(new_station_values), len(moving_stations))