def test_outdated_station(self, session, redis_client, cellarea_queue):
    """An older station record does not update existing station records."""
    # Existing WCDMA station, last modified Oct 7, 2019 — later than the
    # CSV row's `updated` timestamp below, so the import must not touch it.
    station_data = {
        "radio": Radio.wcdma,
        "mcc": 202,
        "mnc": 1,
        "lac": 2120,
        "cid": 12842,
        "lat": 38.85,
        "lon": 23.41,
        "radius": 1,
        "samples": 1,
        "created": datetime(2019, 1, 1, tzinfo=UTC),
        "modified": datetime(2019, 10, 7, tzinfo=UTC),
    }
    station = CellShard.create(_raise_invalid=True, **station_data)
    session.add(station)
    session.flush()
    # Same cell identifiers (202/1/2120/12842) as the station above, but with
    # an older `updated` epoch (1570120316 = Oct 3, 2019), and a different
    # position that must NOT be applied.
    csv = StringIO("""\
radio,mcc,net,area,cell,unit,lon,lat,range,samples,changeable,created,updated,averageSignal
UMTS,202,1,2120,12842,,23.4123167,38.8574351,0,6,1,1568220564,1570120316,
""")
    read_stations_from_csv(session, csv, redis_client, cellarea_queue)
    # The existing station is unmodified
    wcdma = session.query(CellShard.shard_model(Radio.wcdma)).one()
    assert wcdma.lat == 38.85
    assert wcdma.lon == 23.41
    assert wcdma.created == datetime(2019, 1, 1, tzinfo=UTC)
    assert wcdma.modified == datetime(2019, 10, 7, tzinfo=UTC)
    # No CellAreas or RegionStats are generated
    assert session.query(func.count(CellArea.areaid)).scalar() == 0
    assert session.query(func.count(RegionStat.region)).scalar() == 0
def test_modified_station(self, session, redis_client, cellarea_queue):
    """A modified station updates existing records."""
    # Seed the database with a UMTS station last modified Jan 1, 2019,
    # including an explicit bounding box and sample count.
    initial = {
        "radio": Radio.umts,
        "mcc": 202,
        "mnc": 1,
        "lac": 2120,
        "cid": 12842,
        "lat": 38.85,
        "lon": 23.41,
        "min_lat": 38.7,
        "max_lat": 38.9,
        "min_lon": 23.4,
        "max_lon": 23.5,
        "radius": 1,
        "samples": 1,
        "created": datetime(2019, 1, 1, tzinfo=UTC),
        "modified": datetime(2019, 1, 1, tzinfo=UTC),
    }
    seeded = CellShard.create(_raise_invalid=True, **initial)
    session.add(seeded)
    session.flush()

    # CSV row for the same cell, with a newer `updated` epoch and a new
    # position, which should replace the seeded values.
    csv_data = StringIO(
        """\
radio,mcc,net,area,cell,unit,lon,lat,range,samples,changeable,created,updated,averageSignal
UMTS,202,1,2120,12842,,23.4123167,38.8574351,0,6,1,1568220564,1570120316,
"""
    )
    read_stations_from_csv(session, csv_data, redis_client, cellarea_queue)

    # Position and import metadata now come from the CSV row
    shard = session.query(CellShard.shard_model(Radio.umts)).one()
    assert shard.lat == 38.8574351
    assert shard.lon == 23.4123167
    assert shard.radius == 0
    assert shard.samples == 6
    assert shard.created == datetime(2019, 9, 11, 16, 49, 24, tzinfo=UTC)
    assert shard.modified == datetime(2019, 10, 3, 16, 31, 56, tzinfo=UTC)

    # The bounding box and region were left untouched
    assert shard.max_lat == initial["max_lat"]
    assert shard.min_lat == initial["min_lat"]
    assert shard.max_lon == initial["max_lon"]
    assert shard.min_lon == initial["min_lon"]
    assert shard.region == "GR"

    # A modified station triggers the creation of a new CellArea
    area = session.query(CellArea).order_by(CellArea.areaid).one()
    assert area.areaid == (Radio.wcdma, 202, 1, 2120)

    # The new CellArea triggers the creation of a RegionStat
    region_stat = session.query(RegionStat).order_by("region").one()
    assert region_stat.region == "GR"
    assert region_stat.wcdma == 1
def read_stations_from_csv(session, file_handle, redis_client, cellarea_queue):
    """
    Read stations from a public cell export CSV.

    :arg session: a database session
    :arg file_handle: an open file handle for the CSV data
    :arg redis_client: a Redis client
    :arg cellarea_queue: the DataQueue for updating cellarea IDs
    :raises InvalidCSV: if a row has an unknown radio type, or if the first
        data row fails validation (taken as a sign the whole file is bad)
    """
    # Avoid circular imports
    from ichnaea.data.tasks import update_cellarea, update_statregion

    csv_content = peekable(reader(file_handle))
    # UMTS was the original name for WCDMA stations
    radio_type = {"UMTS": "wcdma", "GSM": "gsm", "LTE": "lte", "": "Unknown"}

    counts = defaultdict(Counter)  # per-radio Counter of new/updated/found
    areas = set()  # area IDs touched since the last flush to the queue
    areas_total = 0
    total = 0

    if not csv_content:
        LOGGER.warning("Nothing to process.")
        return

    first_row = csv_content.peek()
    if first_row == _FIELD_NAMES:
        # Skip the first row because it's a header row
        next(csv_content)
    else:
        LOGGER.warning("Expected header row, got data: %s", first_row)

    for row in csv_content:
        try:
            radio = radio_type[row[0]]
        except KeyError:
            raise InvalidCSV("Unknown radio type in row: %s" % row)

        if radio == "Unknown":
            LOGGER.warning("Skipping unknown radio: %s", row)
            continue

        try:
            data = {
                "radio": radio,
                "mcc": int(row[1]),
                "mnc": int(row[2]),
                "lac": int(row[3]),
                "cid": int(row[4]),
                "psc": int(row[5]) if row[5] else 0,
                "lon": float(row[6]),
                "lat": float(row[7]),
                # Some exported radiuses exceed the max and fail validation
                "radius": min(int(row[8]), CELL_MAX_RADIUS),
                "samples": int(row[9]),
                # row[10] is "changable", always 1 and not imported
                "created": datetime.fromtimestamp(int(row[11]), UTC),
                "modified": datetime.fromtimestamp(int(row[12]), UTC),
            }
            shard = CellShard.create(_raise_invalid=True, **data)
        except (colander.Invalid, ValueError) as e:
            if total == 0:
                # If the first row is invalid, it's likely the rest of the
                # file is, too--drop out here.
                raise InvalidCSV("first row %s is invalid: %s" % (row, e))
            else:
                LOGGER.warning("row %s is invalid: %s", row, e)
                continue

        # Is this station in the database?
        shard_type = shard.__class__
        existing = (
            session.query(shard_type)
            .filter(shard_type.cellid == shard.cellid)
            .options(load_only("modified"))
            .one_or_none()
        )
        if existing:
            if existing.modified < data["modified"]:
                # Update existing station with new data
                operation = "updated"
                existing.psc = shard.psc
                existing.lon = shard.lon
                existing.lat = shard.lat
                existing.radius = shard.radius
                existing.samples = shard.samples
                existing.created = shard.created
                existing.modified = shard.modified
            else:
                # Do nothing to existing station record
                operation = "found"
        else:
            # Add a new station record; seed its bounding box from the
            # single known position.
            operation = "new"
            shard.min_lat = shard.lat
            shard.max_lat = shard.lat
            shard.min_lon = shard.lon
            shard.max_lon = shard.lon
            session.add(shard)
        counts[radio][operation] += 1

        # Process the cell area?
        if operation in {"new", "updated"}:
            areas.add(area_id(shard))

        # Process a chunk of stations, report on progress
        total += 1
        if total % 1000 == 0:
            session.commit()
            LOGGER.info("Processed %d stations", total)

        if areas and (len(areas) % 1000 == 0):
            # Flush a batch of pending area IDs to the queue and reset
            session.commit()
            areas_total += len(areas)
            LOGGER.info("Processed %d station areas", areas_total)
            with redis_pipeline(redis_client) as pipe:
                cellarea_queue.enqueue(list(areas), pipe=pipe)
            update_cellarea.delay()
            areas = set()

    # Commit remaining station data
    session.commit()

    # Update the remaining cell areas
    if areas:
        areas_total += len(areas)
        with redis_pipeline(redis_client) as pipe:
            cellarea_queue.enqueue(list(areas), pipe=pipe)
        update_cellarea.delay()

    # Now that we've updated all the cell areas, we need to update the
    # statregion
    update_statregion.delay()

    # Summarize results.  NOTE: the loop variable is named radio_name (not
    # radio_type) to avoid shadowing the radio-name lookup dict above.
    LOGGER.info("Complete, processed %d station%s:", total, "" if total == 1 else "s")
    for radio_name, op_counts in sorted(counts.items()):
        LOGGER.info(
            " %s: %d new, %d updated, %d already loaded",
            radio_name,
            op_counts["new"],
            op_counts["updated"],
            op_counts["found"],
        )
    if areas_total:
        LOGGER.info(
            " %d station area%s updated", areas_total, "" if areas_total == 1 else "s"
        )