def post_import(self):
    """Insert stations for districts missing from the stations file,
    then repair corrupt district polygons in the database.

    The missing records have no point and no postcode we could geocode,
    so they are saved with location=None.
    """
    # iterate self.missing_stations + insert
    # points are missing and we have no postcodes to geocode
    self.stations = StationSet()
    for record in self.missing_stations:
        address_parts = self.split_address(record[2])
        self.add_polling_station({
            'internal_council_id': record[1],
            'postcode': address_parts['postcode'],
            'address': address_parts['address'],
            'location': None,
            'council': self.council
        })
    self.stations.save()

    """
    This data isn't great – the polygons seem to be corrupt
    in some way. PostGIS can fix them though!
    """
    print("running fixup SQL")
    table_name = PollingDistrict()._meta.db_table
    # use a context manager so the cursor is always closed
    # (the original leaked the cursor)
    with connection.cursor() as cursor:
        # table_name comes from the Django model meta, not user input,
        # so str.format here is not an injection risk
        cursor.execute("""
        UPDATE {0} SET area=ST_Multi(ST_CollectionExtract(ST_MakeValid(area), 3))
        WHERE NOT ST_IsValid(area);
        """.format(table_name))
class BaseGenericApiImporter(BaseStationsDistrictsImporter):
    """Importer base class that downloads districts/stations from URLs.

    Subclasses set districts_url and/or stations_url; a None URL means
    that data set is skipped ('stations only' / 'districts only').
    """

    srid = 4326
    districts_srid = 4326

    districts_name = None
    districts_url = None

    stations_name = None
    stations_url = None

    local_files = False

    def import_data(self):
        """Import whichever of districts/stations have a URL configured."""
        self.districts = DistrictSet()
        self.stations = StationSet()

        # deal with 'stations only' or 'districts only' data
        if self.districts_url is not None:
            self.import_polling_districts()
        if self.stations_url is not None:
            self.import_polling_stations()

        self.districts.save()
        self.stations.save()

    def get_districts(self):
        """Download the districts file to a temp file and parse it."""
        with tempfile.NamedTemporaryFile() as tmp:
            # urlretrieve's return value was previously bound to an
            # unused variable; it is not needed
            urllib.request.urlretrieve(self.districts_url, tmp.name)
            return self.get_data(self.districts_filetype, tmp.name)

    def get_stations(self):
        """Download the stations file to a temp file and parse it."""
        with tempfile.NamedTemporaryFile() as tmp:
            urllib.request.urlretrieve(self.stations_url, tmp.name)
            return self.get_data(self.stations_filetype, tmp.name)
class BaseStationsAddressesImporter(BaseStationsImporter, BaseAddressesImporter):
    """Combined importer: residential addresses plus polling stations."""

    def import_data(self):
        """Import both data sets, then persist them."""
        self.addresses = AddressSet(self.logger)
        self.stations = StationSet()

        self.import_residential_addresses()
        self.import_polling_stations()

        self.addresses.save(self.batch_size)
        self.stations.save()
class BaseStationsDistrictsImporter(BaseStationsImporter, BaseDistrictsImporter):
    """Combined importer: polling districts plus polling stations."""

    def import_data(self):
        """Import districts and stations, saving districts first."""
        self.districts = DistrictSet()
        self.stations = StationSet()

        self.import_polling_districts()
        self.import_polling_stations()

        self.districts.save()
        self.stations.save()
def post_import(self):
    """Insert address-only station entries for leftover district codes.

    These are districts where we hold a station address but never saw a
    matching point, so each is saved with location=None.
    """
    self.stations = StationSet()
    for code, address in self.station_addresses.items():
        self.add_polling_station({
            "internal_council_id": code,
            "postcode": "",
            "address": address,
            "location": None,
            "council": self.council,
        })
    self.stations.save()
def post_import(self):
    """Insert address-only station entries for leftover district codes.

    Mops up districts with a station address attached to a district
    code but no point; each is saved with location=None.
    """
    self.stations = StationSet()
    for code, address in self.station_addresses.items():
        self.add_polling_station({
            'internal_council_id': code,
            'postcode': '',
            'address': address,
            'location': None,
            'council': self.council
        })
    self.stations.save()
def import_data(self):
    """Run the optional pre-import hook, then import addresses and stations."""
    # pre_import is optional: NotImplementedError from the base class
    # simply means the subclass has no pre-import work to do.
    try:
        self.pre_import()
    except NotImplementedError:
        pass

    self.addresses = AddressSet(self.logger)
    self.stations = StationSet()

    self.import_residential_addresses()
    self.import_polling_stations()

    self.addresses.save(self.batch_size)
    self.stations.save()
def import_data(self):
    """Run the optional pre-import hook, then import districts and stations."""
    # pre_import is optional: NotImplementedError from the base class
    # simply means the subclass has no pre-import work to do.
    try:
        self.pre_import()
    except NotImplementedError:
        pass

    self.districts = DistrictSet()
    self.stations = StationSet()

    self.import_polling_districts()
    self.import_polling_stations()

    self.districts.save()
    self.stations.save()
def import_data(self):
    """Import districts and stations, saving districts first."""
    self.districts = DistrictSet()
    self.stations = StationSet()

    self.import_polling_districts()
    self.import_polling_stations()

    self.districts.save()
    self.stations.save()
def post_import(self):
    """Insert stations for districts missing from the stations file,
    then repair corrupt district polygons in the database.

    The missing records have no point and no postcode we could geocode,
    so they are saved with location=None.
    """
    # iterate self.missing_stations + insert
    # points are missing and we have no postcodes to geocode
    self.stations = StationSet()
    for record in self.missing_stations:
        address_parts = self.split_address(record[2])
        self.add_polling_station({
            'internal_council_id': record[1],
            'postcode': address_parts['postcode'],
            'address': address_parts['address'],
            'location': None,
            'council': self.council
        })
    self.stations.save()

    """
    This data isn't great – the polygons seem to be corrupt
    in some way. PostGIS can fix them though!
    """
    print("running fixup SQL")
    table_name = PollingDistrict()._meta.db_table
    # use a context manager so the cursor is always closed
    # (the original leaked the cursor)
    with connection.cursor() as cursor:
        # table_name comes from the Django model meta, not user input,
        # so str.format here is not an injection risk
        cursor.execute("""
        UPDATE {0} SET area=ST_Multi(ST_CollectionExtract(ST_MakeValid(area), 3))
        WHERE NOT ST_IsValid(area);
        """.format(table_name))
def import_data(self):
    """Optional pre-import hook, then fetch whichever data sets have URLs."""
    try:
        self.pre_import()
    except NotImplementedError:
        # no pre-import work defined for this importer
        pass

    self.districts = DistrictSet()
    self.stations = StationSet()

    # a subclass may provide only stations or only districts
    if self.districts_url is not None:
        self.import_polling_districts()
    if self.stations_url is not None:
        self.import_polling_stations()

    self.districts.save()
    self.stations.save()
class BaseStationsDistrictsImporter(BaseStationsImporter, BaseDistrictsImporter):
    """Combined importer: polling districts plus polling stations."""

    def pre_import(self):
        # Subclasses may override; the default signals "nothing to do"
        # and is swallowed by import_data below.
        raise NotImplementedError

    def import_data(self):
        """Run the optional pre-import hook, then both imports."""
        try:
            self.pre_import()
        except NotImplementedError:
            pass

        self.districts = DistrictSet()
        self.stations = StationSet()

        self.import_polling_districts()
        self.import_polling_stations()

        self.districts.save()
        self.stations.save()
def import_data(self):
    """Import districts and stations, pre-computing split districts first."""
    # populate self.split_districts before the main import runs
    self.find_split_districts()

    self.districts = DistrictSet()
    self.stations = StationSet()

    self.import_polling_districts()
    self.import_polling_stations()

    self.districts.save()
    self.stations.save()
class BaseGenericApiImporter(BaseStationsDistrictsImporter):
    """Importer that downloads districts/stations data from remote URLs.

    Subclasses set districts_url and/or stations_url; a None URL means
    that data set is skipped.
    """

    srid = 4326
    districts_srid = 4326

    districts_name = None
    districts_url = None

    stations_name = None
    stations_url = None

    local_files = False

    def import_data(self):
        """Optional pre-import hook, then fetch whichever data sets have URLs."""
        try:
            self.pre_import()
        except NotImplementedError:
            pass

        self.districts = DistrictSet()
        self.stations = StationSet()

        # handle 'stations only' or 'districts only' importers
        if self.districts_url is not None:
            self.import_polling_districts()
        if self.stations_url is not None:
            self.import_polling_stations()

        self.districts.save()
        self.stations.save()

    def get_districts(self):
        """Download the districts file to a temp file and parse it."""
        with tempfile.NamedTemporaryFile() as temp_file:
            urllib.request.urlretrieve(self.districts_url, temp_file.name)
            return self.get_data(self.districts_filetype, temp_file.name)

    def get_stations(self):
        """Download the stations file to a temp file and parse it."""
        with tempfile.NamedTemporaryFile() as temp_file:
            urllib.request.urlretrieve(self.stations_url, temp_file.name)
            return self.get_data(self.stations_filetype, temp_file.name)
def import_data(self):
    """Fetch whichever of districts/stations have a URL configured."""
    self.districts = DistrictSet()
    self.stations = StationSet()

    # deal with 'stations only' or 'districts only' data:
    # a None URL means the subclass has no data of that kind
    for url, run_import in (
        (self.districts_url, self.import_polling_districts),
        (self.stations_url, self.import_polling_stations),
    ):
        if url is not None:
            run_import()

    self.districts.save()
    self.stations.save()
class BaseStationsAddressesImporter(BaseStationsImporter, BaseAddressesImporter):
    """Combined importer with fuzzy matching of addresses to stations."""

    # fuzzy-matching defaults; subclasses may tighten or disable them
    fuzzy_match = True
    match_threshold = 100

    def pre_import(self):
        # Subclasses may override; the default signals "nothing to do"
        # and is swallowed by import_data below.
        raise NotImplementedError

    def import_data(self):
        """Optional pre-import hook, then import addresses and stations."""
        try:
            self.pre_import()
        except NotImplementedError:
            pass

        self.addresses = AddressList(self.logger)
        self.stations = StationSet()

        self.import_residential_addresses()
        self.import_polling_stations()

        self.addresses.save(self.batch_size, self.fuzzy_match, self.match_threshold)
        self.stations.save()
class Command(BaseGitHubImporter):
    """Canterbury (E07000106) polling station importer for parl.2019-12-12.

    Station addresses are embedded in the districts file; the stations
    endpoint only serves geometry, so addresses are stashed during the
    district pass and consumed during the station pass.
    """

    srid = 4326
    districts_srid = 4326
    council_id = "E07000106"
    elections = ["parl.2019-12-12"]
    scraper_name = "wdiv-scrapers/DC-PollingStations-Canterbury"
    geom_type = "geojson"

    # Canterbury embed the station addresses in the districts file
    # The stations endpoint only serves up the geo data
    # (it doesn't include the station addresses)
    # NOTE(review): class-level mutable dict, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    station_addresses = {}

    def district_record_to_dict(self, record):
        """Convert one district record; stash its station address by code.

        Raises ValueError if the same code reappears with a different
        station address.
        """
        poly = self.extract_geometry(record, self.geom_type, self.get_srid("districts"))
        code = record["ID"].strip()
        address = record["POLLING_PL"].strip()

        # Ad-hoc fixes for parl.2019-12-12:
        # the points got updated in the API, but the addresses didn't
        if code == "CWI2":
            address = "Thanington Neighbourhood Resource Centre\nThanington Road\nCanterbury\nCT1 3XE"
        if code == "RCS2":
            address = (
                "Chartham Sports Club\nBeech Avenue\nChartham\nCanterbury\nCT4 7TA"
            )

        # the same code may appear twice, but only with an identical address
        if code in self.station_addresses and self.station_addresses[code] != address:
            raise ValueError(
                "District code appears twice with 2 different station addresses"
            )
        self.station_addresses[code] = address

        return {
            "internal_council_id": code,
            "name": record["NAME"].strip() + " - " + code,
            "area": poly,
            "polling_station_id": code,
        }

    def station_record_to_dict(self, record):
        """Convert one station record, pairing it with its stashed address."""
        code = record["Polling_di"].strip()
        address = self.station_addresses[code]
        del self.station_addresses[code]  # remove station addresses as we use them
        location = self.extract_geometry(record, self.geom_type, self.get_srid("stations"))

        # a MultiPoint containing a single point is really just a Point
        if isinstance(location, MultiPoint) and len(location) == 1:
            location = location[0]

        # point supplied is bang on the building
        # but causes google directions API to give us a strange route
        if code == "CWE2" and address.startswith("St Dunstan"):
            location = Point(1.070064, 51.283614, srid=4326)

        return {
            "internal_council_id": code,
            "postcode": "",
            "address": address,
            "location": location,
        }

    def post_import(self):
        """Insert address-only stations for codes that never got a point."""
        # mop up any districts where we have a station address
        # attached to a district code but no point
        self.stations = StationSet()
        for code in self.station_addresses:
            self.add_polling_station({
                "internal_council_id": code,
                "postcode": "",
                "address": self.station_addresses[code],
                "location": None,
                "council": self.council,
            })
        self.stations.save()
class Command(BaseCsvStationsShpDistrictsImporter):
    """
    Imports the Polling Station data from Calderdale
    """
    council_id = 'E08000033'
    districts_name = 'polling_districts'
    stations_name = 'Polling Stations.csv'
    elections = [
        'pcc.2016-05-05',
        'ref.2016-06-23'
    ]
    # NOTE(review): class-level mutable list, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    missing_stations = []

    def get_station_hash(self, record):
        """Build a de-duplication key from a station CSV row."""
        return "-".join([
            record.address,
            record.polling_district,
            record.ward,
            record.easting,
            record.northing,
        ])

    def import_polling_districts(self):
        """Read districts from the shapefile and import them."""
        sf = shapefile.Reader("{0}/{1}".format(
            self.base_folder_path,
            self.districts_name
        ))
        for district in sf.shapeRecords():
            district_info = self.district_record_to_dict(district.record)
            if 'council' not in district_info:
                district_info['council'] = self.council
            # convert the shapefile geometry to GEOS via its GeoJSON form
            geojson = json.dumps(district.shape.__geo_interface__)
            poly = self.clean_poly(GEOSGeometry(geojson, srid=self.get_srid('districts')))
            """
            File contains 2 districts with the code DC.
            One of them covers a distinct area not covered by another
            district. The other exactly contains districts DD and DE.
            I've assumed that the one covering a distinct area is
            'correct' (i.e: A property may not be in 2 districts
            simultaneously). Discard the other district DC.
            """
            # the discarded duplicate DC is identified by its exact perimeter
            if district.record[1] == 'DC' and poly.length == 16675.9905799729:
                pass
            else:
                district_info['area'] = poly
                self.add_polling_district(district_info)

    def district_record_to_dict(self, record):
        """
        Districts BB and BC don't appear in the stations file
        but the station addresses are embedded in the districts file.
        Save them for later.
        """
        if record[1] == 'BB' or record[1] == 'BC':
            self.missing_stations.append(record)
        return {
            'internal_council_id': record[1],
            'name': "%s - %s" % (record[0], record[1]),
            'polling_station_id': record[1]
        }

    def split_address(self, in_address):
        """Split a comma-separated address, peeling off a trailing postcode.

        A final element of 7 or 8 characters is treated as a postcode,
        except the literal 'Halifax' (a 7-letter town name).
        Returns a dict with 'address' (newline-joined) and 'postcode'.
        """
        address_parts = in_address.replace('.', '').split(", ")
        if (len(address_parts[-1]) == 7 or len(address_parts[-1]) == 8) and address_parts[-1] != 'Halifax':
            out_address = "\n".join(address_parts[:-1])
            postcode = address_parts[-1]
        else:
            out_address = "\n".join(address_parts)
            postcode = ''
        return {
            'address': out_address,
            'postcode': postcode
        }

    def station_record_to_dict(self, record):
        """Convert one station CSV row; returns None for unusable rows."""
        # discard the rows with no district id/address
        if not record.polling_district:
            return None
        location = Point(float(record.easting), float(record.northing), srid=self.get_srid())
        address_parts = self.split_address(record.address)
        return {
            'internal_council_id': record.polling_district,
            'postcode': address_parts['postcode'],
            'address': address_parts['address'],
            'location': location
        }

    def post_import(self):
        """Insert stations for districts missing from the stations file,
        then repair corrupt district polygons in the database."""
        # iterate self.missing_stations + insert
        # points are missing and we have no postcodes to geocode
        self.stations = StationSet()
        for record in self.missing_stations:
            address_parts = self.split_address(record[2])
            self.add_polling_station({
                'internal_council_id': record[1],
                'postcode': address_parts['postcode'],
                'address': address_parts['address'],
                'location': None,
                'council': self.council
            })
        self.stations.save()

        """
        This data isn't great – the polygons seem to be corrupt
        in some way. PostGIS can fix them though!
        """
        print("running fixup SQL")
        table_name = PollingDistrict()._meta.db_table
        cursor = connection.cursor()
        cursor.execute("""
        UPDATE {0} SET area=ST_Multi(ST_CollectionExtract(ST_MakeValid(area), 3))
        WHERE NOT ST_IsValid(area);
        """.format(table_name))
class Command(BaseGitHubImporter):
    """Canterbury (E07000106) polling station importer for local.2019-05-02.

    Station addresses are embedded in the districts file; the stations
    endpoint only serves geometry, so addresses are stashed during the
    district pass and consumed during the station pass.
    """

    srid = 4326
    districts_srid = 4326
    council_id = "E07000106"
    elections = ["local.2019-05-02"]
    scraper_name = "wdiv-scrapers/DC-PollingStations-Canterbury"
    geom_type = "geojson"

    # Canterbury embed the station addresses in the districts file
    # The stations endpoint only serves up the geo data
    # (it doesn't include the station addresses)
    # NOTE(review): class-level mutable dict, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    station_addresses = {}

    def district_record_to_dict(self, record):
        """Convert one district record; stash its station address by code.

        Raises ValueError if the same code reappears with a different
        station address.
        """
        poly = self.extract_geometry(record, self.geom_type, self.get_srid("districts"))
        code = record["ID"].strip()
        address = record["POLLING_PL"].strip()

        # the same code may appear twice, but only with an identical address
        if code in self.station_addresses and self.station_addresses[code] != address:
            raise ValueError(
                "District code appears twice with 2 different station addresses"
            )
        self.station_addresses[code] = address

        return {
            "internal_council_id": code,
            "name": record["NAME"].strip() + " - " + code,
            "area": poly,
            "polling_station_id": code,
        }

    def station_record_to_dict(self, record):
        """Convert one station record, pairing it with its stashed address."""
        code = record["Polling_di"].strip()
        address = self.station_addresses[code]
        del self.station_addresses[code]  # remove station addresses as we use them
        location = self.extract_geometry(record, self.geom_type, self.get_srid("stations"))

        # a MultiPoint containing a single point is really just a Point
        if isinstance(location, MultiPoint) and len(location) == 1:
            location = location[0]

        return {
            "internal_council_id": code,
            "postcode": "",
            "address": address,
            "location": location,
        }

    def post_import(self):
        """Insert address-only stations for codes that never got a point."""
        # mop up any districts where we have a station address
        # attached to a district code but no point
        self.stations = StationSet()
        for code in self.station_addresses:
            self.add_polling_station({
                "internal_council_id": code,
                "postcode": "",
                "address": self.station_addresses[code],
                "location": None,
                "council": self.council,
            })
        self.stations.save()
class Command(BaseGitHubImporter):
    """Canterbury (E07000106) polling station importer for parl.2017-06-08.

    Station addresses are embedded in the districts file; the stations
    endpoint only serves geometry, so addresses are stashed during the
    district pass and consumed during the station pass.
    """

    srid = 4326
    districts_srid = 4326
    council_id = "E07000106"
    elections = ["parl.2017-06-08"]
    scraper_name = "wdiv-scrapers/DC-PollingStations-Canterbury"
    geom_type = "geojson"

    # Canterbury embed the station addresses in the districts file
    # The stations endpoint only serves up the geo data
    # (it doesn't include the station addresses)
    # NOTE(review): class-level mutable dict, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    station_addresses = {}

    def district_record_to_dict(self, record):
        """Convert one district record; stash its station address by code.

        Raises ValueError if the same code reappears with a different
        station address.
        """
        poly = self.extract_geometry(record, self.geom_type, self.get_srid("districts"))
        code = record["ID"].strip()
        address = record["POLLING_PL"].strip()

        # the same code may appear twice, but only with an identical address
        if code in self.station_addresses and self.station_addresses[code] != address:
            raise ValueError(
                "District code appears twice with 2 different station addresses"
            )
        self.station_addresses[code] = address

        return {
            "internal_council_id": code,
            "name": record["NAME"].strip() + " - " + code,
            "area": poly,
            "polling_station_id": code,
        }

    def extract_json_point(self, record, srid):
        """Parse record['geometry'] (GeoJSON string) into a GEOS geometry."""
        geom = json.loads(record["geometry"])
        # if geometry object is a MultiPoint with only one Point in it, convert it to a Point
        if (geom["geometry"]["type"] == "MultiPoint"
                and len(geom["geometry"]["coordinates"]) == 1):
            geom["geometry"]["type"] = "Point"
            geom["geometry"]["coordinates"] = geom["geometry"]["coordinates"][0]
        geojson = json.dumps(geom["geometry"])
        return self.clean_poly(GEOSGeometry(geojson, srid=srid))

    def station_record_to_dict(self, record):
        """Convert one station record, pairing it with its stashed address."""
        code = record["Polling_di"].strip()
        address = self.station_addresses[code]
        del (self.station_addresses[code])  # remove station addresses as we use them
        location = self.extract_json_point(record, self.get_srid("stations"))
        return {
            "internal_council_id": code,
            "postcode": "",
            "address": address,
            "location": location,
        }

    def post_import(self):
        """Insert address-only stations for codes that never got a point."""
        # mop up any districts where we have a station address
        # attached to a district code but no point
        self.stations = StationSet()
        for code in self.station_addresses:
            self.add_polling_station({
                "internal_council_id": code,
                "postcode": "",
                "address": self.station_addresses[code],
                "location": None,
                "council": self.council,
            })
        self.stations.save()
class Command(BaseMorphApiImporter):
    """Canterbury (E07000106) polling station importer for local.kent.2017-05-04.

    Station addresses are embedded in the districts file; the stations
    endpoint only serves geometry, so addresses are stashed during the
    district pass and consumed during the station pass.
    """

    srid = 4326
    districts_srid = 4326
    council_id = 'E07000106'
    elections = ['local.kent.2017-05-04']
    scraper_name = 'wdiv-scrapers/DC-PollingStations-Canterbury'
    geom_type = 'geojson'

    # Canterbury embed the station addresses in the districts file
    # The stations endpoint only serves up the geo data
    # (it doesn't include the station addresses)
    # NOTE(review): class-level mutable dict, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    station_addresses = {}

    def district_record_to_dict(self, record):
        """Convert one district record; stash its station address by code.

        Raises ValueError if the same code reappears with a different
        station address.
        """
        poly = self.extract_geometry(record, self.geom_type, self.get_srid('districts'))
        code = record['ID'].strip()
        address = record['POLLING_PL'].strip()

        # the same code may appear twice, but only with an identical address
        if code in self.station_addresses and self.station_addresses[code] != address:
            raise ValueError(
                'District code appears twice with 2 different station addresses'
            )
        self.station_addresses[code] = address

        return {
            'internal_council_id': code,
            'name': record['NAME'].strip() + ' - ' + code,
            'area': poly,
            'polling_station_id': code,
        }

    def extract_json_point(self, record, srid):
        """Parse record['geometry'] (GeoJSON string) into a GEOS geometry."""
        geom = json.loads(record['geometry'])
        # if geometry object is a MultiPoint with only one Point in it, convert it to a Point
        if geom['geometry']['type'] == 'MultiPoint' and len(
                geom['geometry']['coordinates']) == 1:
            geom['geometry']['type'] = 'Point'
            geom['geometry']['coordinates'] = geom['geometry']['coordinates'][0]
        geojson = json.dumps(geom['geometry'])
        return self.clean_poly(GEOSGeometry(geojson, srid=srid))

    def station_record_to_dict(self, record):
        """Convert one station record, pairing it with its stashed address."""
        code = record['Polling_di'].strip()
        address = self.station_addresses[code]
        del (self.station_addresses[code])  # remove station addresses as we use them
        location = self.extract_json_point(record, self.get_srid('stations'))
        return {
            'internal_council_id': code,
            'postcode': '',
            'address': address,
            'location': location,
        }

    def post_import(self):
        """Insert address-only stations for codes that never got a point."""
        # mop up any districts where we have a station address
        # attached to a district code but no point
        self.stations = StationSet()
        for code in self.station_addresses:
            self.add_polling_station({
                'internal_council_id': code,
                'postcode': '',
                'address': self.station_addresses[code],
                'location': None,
                'council': self.council
            })
        self.stations.save()
class Command(BaseCsvStationsShpDistrictsImporter):
    """
    Imports the Polling Station data from Calderdale
    """
    council_id = 'E08000033'
    districts_name = 'polling_districts'
    stations_name = 'Polling Stations.csv'
    elections = ['pcc.2016-05-05', 'ref.2016-06-23']
    # NOTE(review): class-level mutable list, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    missing_stations = []

    def get_station_hash(self, record):
        """Build a de-duplication key from a station CSV row."""
        return "-".join([
            record.address,
            record.polling_district,
            record.ward,
            record.easting,
            record.northing,
        ])

    def import_polling_districts(self):
        """Read districts from the shapefile and import them."""
        sf = shapefile.Reader("{0}/{1}".format(self.base_folder_path,
                                               self.districts_name))
        for district in sf.shapeRecords():
            district_info = self.district_record_to_dict(district.record)
            if 'council' not in district_info:
                district_info['council'] = self.council
            # convert the shapefile geometry to GEOS via its GeoJSON form
            geojson = json.dumps(district.shape.__geo_interface__)
            poly = self.clean_poly(
                GEOSGeometry(geojson, srid=self.get_srid('districts')))
            """
            File contains 2 districts with the code DC.
            One of them covers a distinct area not covered by another
            district. The other exactly contains districts DD and DE.
            I've assumed that the one covering a distinct area is
            'correct' (i.e: A property may not be in 2 districts
            simultaneously). Discard the other district DC.
            """
            # the discarded duplicate DC is identified by its exact perimeter
            if district.record[1] == 'DC' and poly.length == 16675.9905799729:
                pass
            else:
                district_info['area'] = poly
                self.add_polling_district(district_info)

    def district_record_to_dict(self, record):
        """
        Districts BB and BC don't appear in the stations file
        but the station addresses are embedded in the districts file.
        Save them for later.
        """
        if record[1] == 'BB' or record[1] == 'BC':
            self.missing_stations.append(record)
        return {
            'internal_council_id': record[1],
            'name': "%s - %s" % (record[0], record[1]),
            'polling_station_id': record[1]
        }

    def split_address(self, in_address):
        """Split a comma-separated address, peeling off a trailing postcode.

        A final element of 7 or 8 characters is treated as a postcode,
        except the literal 'Halifax' (a 7-letter town name).
        Returns a dict with 'address' (newline-joined) and 'postcode'.
        """
        address_parts = in_address.replace('.', '').split(", ")
        if (len(address_parts[-1]) == 7 or len(address_parts[-1]) == 8) and address_parts[-1] != 'Halifax':
            out_address = "\n".join(address_parts[:-1])
            postcode = address_parts[-1]
        else:
            out_address = "\n".join(address_parts)
            postcode = ''
        return {'address': out_address, 'postcode': postcode}

    def station_record_to_dict(self, record):
        """Convert one station CSV row; returns None for unusable rows."""
        # discard the rows with no district id/address
        if not record.polling_district:
            return None
        location = Point(float(record.easting), float(record.northing), srid=self.get_srid())
        address_parts = self.split_address(record.address)
        return {
            'internal_council_id': record.polling_district,
            'postcode': address_parts['postcode'],
            'address': address_parts['address'],
            'location': location
        }

    def post_import(self):
        """Insert stations for districts missing from the stations file,
        then repair corrupt district polygons in the database."""
        # iterate self.missing_stations + insert
        # points are missing and we have no postcodes to geocode
        self.stations = StationSet()
        for record in self.missing_stations:
            address_parts = self.split_address(record[2])
            self.add_polling_station({
                'internal_council_id': record[1],
                'postcode': address_parts['postcode'],
                'address': address_parts['address'],
                'location': None,
                'council': self.council
            })
        self.stations.save()

        """
        This data isn't great – the polygons seem to be corrupt
        in some way. PostGIS can fix them though!
        """
        print("running fixup SQL")
        table_name = PollingDistrict()._meta.db_table
        cursor = connection.cursor()
        cursor.execute("""
        UPDATE {0} SET area=ST_Multi(ST_CollectionExtract(ST_MakeValid(area), 3))
        WHERE NOT ST_IsValid(area);
        """.format(table_name))
class Command(BaseMorphApiImporter):
    """Mid Sussex (E07000228) importer for local.west-sussex.2017-05-04.

    District codes that appear with more than one distinct station UPRN
    are treated as 'split districts' and their stations are skipped.
    """

    srid = 4326
    districts_srid = 4326
    council_id = 'E07000228'
    elections = ['local.west-sussex.2017-05-04']
    scraper_name = 'wdiv-scrapers/DC-PollingStations-Mid-Sussex'
    geom_type = 'geojson'
    # NOTE(review): class-level mutable set, shared across instances —
    # presumably the command is only instantiated once per run; confirm.
    split_districts = set()

    def get_station_hash(self, record):
        # handle exact dupes on code/address
        return "-".join([
            record['msercode'],
            record['uprn'],
        ])

    def find_split_districts(self):
        """Populate self.split_districts with codes mapped to >1 UPRN.

        Identifies any district codes which appear more than once with
        2 different polling station addresses — we do not want to import
        these. Replaces the previous O(n^2) pairwise comparison with a
        single grouping pass; the resulting set is identical.
        """
        uprns_by_code = {}
        for station in self.get_stations():
            uprns_by_code.setdefault(station['msercode'], set()).add(station['uprn'])
        for code, uprns in uprns_by_code.items():
            if len(uprns) > 1:
                self.split_districts.add(code)

    def district_record_to_dict(self, record):
        """Convert a raw district record to the importer dict format."""
        poly = self.extract_geometry(record, self.geom_type, self.get_srid('districts'))
        return {
            'internal_council_id': record['msercode'],
            'name': record['boundname'],
            'area': poly,
            'polling_station_id': record['msercode'],
        }

    def station_record_to_dict(self, record):
        """Convert a raw station record; returns None for split districts."""
        # handle split districts
        if record['msercode'] in self.split_districts:
            return None
        location = self.extract_geometry(record, self.geom_type, self.get_srid('stations'))
        return {
            'internal_council_id': record['msercode'],
            'postcode': '',
            'address': record['address'],
            'location': location,
        }

    def import_data(self):
        """Import districts and stations after pre-computing split districts."""
        # override import_data so we can populate
        # self.split_districts as a pre-process
        self.find_split_districts()
        self.stations = StationSet()
        self.districts = DistrictSet()
        self.import_polling_districts()
        self.import_polling_stations()
        self.districts.save()
        self.stations.save()