Example No. 1
    def insert(self, entries, userid=None):
        all_observations = []
        drop_counter = defaultdict(int)
        new_stations = 0

        # Process entries and group by validated station key
        station_observations = defaultdict(list)
        for entry in entries:
            self.pre_process_entry(entry)

            obs = self.observation_model.create(**entry)
            if not obs:
                drop_counter['malformed'] += 1
                continue

            station_observations[obs.hashkey()].append(obs)

        # Process observations one station at a time
        for key, observations in station_observations.items():
            first_blacklisted = None
            incomplete = False
            station = self.station_model.querykey(self.session, key).first()

            if station is None:
                # Drop observations for blacklisted stations.
                blacklisted, first_blacklisted = self.blacklisted_station(key)
                if blacklisted:
                    drop_counter['blacklisted'] += len(observations)
                    continue

                incomplete = self.incomplete_observation(key)
                if not incomplete:
                    # We discovered an actual new complete station.
                    new_stations += 1

            # Accept all observations
            all_observations.extend(observations)
            num = len(observations)

            # Accept incomplete observations; just don't create stations
            # for them (station creation is a side effect of count-updating).
            if not incomplete and num > 0:
                self.create_or_update_station(station, key, num,
                                              first_blacklisted)

        # Credit the user with discovering any new stations.
        if userid is not None and new_stations > 0:
            scorekey = Score.to_hashkey(
                userid=userid,
                key=ScoreKey['new_' + self.station_type],
                time=self.utcnow.date())
            Score.incr(self.session, scorekey, new_stations)

        added = len(all_observations)
        self.emit_stats(added, drop_counter)

        self.session.add_all(all_observations)
        return added
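
The method above follows a two-phase shape: first validate each entry and group observations by station key, then process one station group at a time, dropping whole groups for blacklisted stations. Below is a minimal, self-contained sketch of that shape; the BLACKLIST set, the 'key' field, and insert_sketch itself are toy stand-ins for the model layer (observation_model.create, hashkey, blacklisted_station), not the real API.

    from collections import defaultdict

    # Toy blacklist; the real code consults a blacklist model per station key.
    BLACKLIST = {('wifi', 'ab:cd:ef:01:23:45')}

    def insert_sketch(entries):
        dropped = defaultdict(int)

        # Phase 1: validate each entry and group by station key.
        grouped = defaultdict(list)
        for entry in entries:
            key = entry.get('key')   # stand-in for obs.hashkey()
            if key is None:          # stand-in for create() returning None
                dropped['malformed'] += 1
                continue
            grouped[key].append(entry)

        # Phase 2: walk one station at a time; drop blacklisted groups whole.
        accepted = []
        for key, observations in grouped.items():
            if key in BLACKLIST:
                dropped['blacklisted'] += len(observations)
                continue
            accepted.extend(observations)
        return accepted, dict(dropped)

    entries = [
        {'key': ('wifi', '00:11:22:33:44:55')},   # accepted
        {'key': ('wifi', 'ab:cd:ef:01:23:45')},   # blacklisted
        {},                                       # malformed: no key
    ]
    accepted, dropped = insert_sketch(entries)
    # accepted has 1 entry; dropped == {'malformed': 1, 'blacklisted': 1}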
Example No. 2
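This variant is identical to Example No. 1 except for the station lookup: it fetches the station with station_model.getkey(self.session, key) rather than station_model.querykey(self.session, key).first(), presumably a direct primary-key get instead of a filtered query.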
    def insert(self, entries, userid=None):
        all_observations = []
        drop_counter = defaultdict(int)
        new_stations = 0

        # Process entries and group by validated station key
        station_observations = defaultdict(list)
        for entry in entries:
            self.pre_process_entry(entry)

            obs = self.observation_model.create(**entry)
            if not obs:
                drop_counter['malformed'] += 1
                continue

            station_observations[obs.hashkey()].append(obs)

        # Process observations one station at a time
        for key, observations in station_observations.items():
            first_blacklisted = None
            incomplete = False
            station = self.station_model.getkey(self.session, key)

            if station is None:
                # Drop observations for blacklisted stations.
                blacklisted, first_blacklisted = self.blacklisted_station(key)
                if blacklisted:
                    drop_counter['blacklisted'] += len(observations)
                    continue

                incomplete = self.incomplete_observation(key)
                if not incomplete:
                    # We discovered an actual new complete station.
                    new_stations += 1

            # Accept all observations
            all_observations.extend(observations)
            num = len(observations)

            # Accept incomplete observations; just don't create stations
            # for them (station creation is a side effect of count-updating).
            if not incomplete and num > 0:
                self.create_or_update_station(station, key, num,
                                              first_blacklisted)

        # Credit the user with discovering any new stations.
        if userid is not None and new_stations > 0:
            scorekey = Score.to_hashkey(
                userid=userid,
                key=ScoreKey['new_' + self.station_type],
                time=self.utcnow.date())
            Score.incr(self.session, scorekey, new_stations)

        added = len(all_observations)
        self.emit_stats(added, drop_counter)

        self.session.add_all(all_observations)
        return added
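
Both examples end by crediting the user via Score.incr with a hash key built from (userid, score kind, date). Here is a toy in-memory sketch of that keyed-counter pattern; the dict-backed SCORES store and score_incr are stand-ins for the real Score model, which presumably persists the counter in the database:

    from collections import defaultdict
    from datetime import date

    # In-memory stand-in for the Score model's keyed counters.
    SCORES = defaultdict(int)

    def score_incr(userid, key, day, value):
        # Bump the per-user, per-kind, per-day counter, mirroring Score.incr().
        SCORES[(userid, key, day)] += value

    score_incr(userid=1, key='new_cell', day=date(2014, 1, 1), value=3)
    assert SCORES[(1, 'new_cell', date(2014, 1, 1))] == 3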
Example No. 3
    def process_reports(self, reports, userid=None):
        positions = []
        cell_observations = []
        wifi_observations = []
        for report in reports:
            report['report_id'] = uuid.uuid1()
            cell, wifi = self.process_report(report)
            cell_observations.extend(cell)
            wifi_observations.extend(wifi)
            if cell or wifi:
                positions.append({
                    'lat': report['lat'],
                    'lon': report['lon'],
                })

        if cell_observations:
            # Group by cell key and create one task per batch of keys.
            self.stats_client.incr('items.uploaded.cell_observations',
                                   len(cell_observations))
            if self.api_key_log:
                self.stats_client.incr(
                    'items.api_log.%s.uploaded.'
                    'cell_observations' % self.api_key_name,
                    len(cell_observations))

            cells = defaultdict(list)
            for obs in cell_observations:
                cells[CellObservation.to_hashkey(obs)].append(obs)

            # Create one task per group of 5 cell keys.
            # Grouping them helps avoid per-task overhead.
            cells = list(cells.values())
            batch_size = 5
            countdown = 0
            for i in range(0, len(cells), batch_size):
                values = []
                for observations in cells[i:i + batch_size]:
                    values.extend([encode_radio_dict(o) for o in observations])
                # Insert the observations. Expire the task if it has not been
                # processed within six hours, to avoid queue overload. Delay
                # each successive task by one more second to even out the
                # workload and avoid parallel updates of the same stations.
                self.insert_cell_task.apply_async(
                    args=[values],
                    kwargs={'userid': userid},
                    expires=21600,
                    countdown=countdown)
                countdown += 1

        if wifi_observations:
            # Group by WiFi key and create one task per batch of keys.
            self.stats_client.incr('items.uploaded.wifi_observations',
                                   len(wifi_observations))
            if self.api_key_log:
                self.stats_client.incr(
                    'items.api_log.%s.uploaded.'
                    'wifi_observations' % self.api_key_name,
                    len(wifi_observations))

            wifis = defaultdict(list)
            for obs in wifi_observations:
                wifis[WifiObservation.to_hashkey(obs)].append(obs)

            # Create one task per group of 20 WiFi keys. We tend to get a
            # huge number of unique WiFi networks per batch upload, with one
            # to very few observations per WiFi network. Grouping them helps
            # avoid per-task overhead.
            wifis = list(wifis.values())
            batch_size = 20
            countdown = 0
            for i in range(0, len(wifis), batch_size):
                values = []
                for observations in wifis[i:i + batch_size]:
                    values.extend(observations)
                # Insert the observations. Expire the task if it has not been
                # processed within six hours, to avoid queue overload. Delay
                # each successive task by one more second to even out the
                # workload and avoid parallel updates of the same stations.
                self.insert_wifi_task.apply_async(
                    args=[values],
                    kwargs={'userid': userid},
                    expires=21600,
                    countdown=countdown)
                countdown += 1

        if userid is not None:
            scorekey = Score.to_hashkey(
                userid=userid,
                key=ScoreKey.location,
                time=util.utcnow().date())
            Score.incr(self.session, scorekey, len(positions))
        if positions:
            self.process_mapstat(positions)
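
Both branches above share one batching pattern: group observations by station key, split the groups into fixed-size chunks, and enqueue one task per chunk with a countdown that grows by one second per task. A self-contained sketch of just that pattern; enqueue is a hypothetical stand-in for insert_cell_task.apply_async / insert_wifi_task.apply_async:

    def enqueue_in_batches(groups, batch_size, enqueue):
        countdown = 0
        for i in range(0, len(groups), batch_size):
            # Flatten this chunk of per-station groups into one task payload.
            values = []
            for observations in groups[i:i + batch_size]:
                values.extend(observations)
            enqueue(values, countdown=countdown)
            countdown += 1  # stagger tasks by one second to spread the load

    # Usage: two tasks, delayed by 0 and 1 seconds.
    enqueue_in_batches(
        [['a1', 'a2'], ['b1'], ['c1'], ['d1']], batch_size=2,
        enqueue=lambda values, countdown: print(countdown, values))
    # -> 0 ['a1', 'a2', 'b1']
    # -> 1 ['c1', 'd1']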