class GeocoderPipeline(object): def __init__(self, session=None): if session is None: session = requests.Session() self.session = session self.client = MapzenAPI(os.environ.get('MAPZEN_API_KEY')) def process_item(self, item, spider): """ Performs geocoding of an event if it doesn't already have coordinates. """ try: location = item['location'].get('address') or item['location']['name'] geocode = self.client.search(location, boundary_country='US', format='keys') coordinates = geocode['features'][0]['geometry']['coordinates'] item['location']['coordinates'] = { 'longitude': str(coordinates[0]), 'latitude': str(coordinates[1]), } item['geocode'] = json.dumps(geocode, indent=4, sort_keys=True) except ValueError: spider.logger.warn('Could not geocode {0}-{1}, skipping.'.format(spider.name, item['id'])) except Exception: spider.logger.exception('Unknown error when geocoding, skipping. Message:') spider.logger.error(json.dumps(item, indent=4, sort_keys=True)) return item
class GeocoderPipeline(object): def __init__(self, session=None): if session is None: session = requests.Session() self.session = session self.client = MapzenAPI(os.environ.get('MAPZEN_API_KEY')) self.geocode_database = Airtable(AIRTABLE_BASE_KEY, AIRTABLE_GEOCODE_TABLE) def process_item(self, item, spider): """ Geocodes an item by: (1) looking in airtable cache (2) making a mapzen query and adding the result to the cache if (1) is not found Mapzen queries are standardized to end with ', Chicago, IL'. If something like '5100 Milwaukee Chicago, IL' is not found, '5100 Milwaukee Ave., Chicago, IL' and '5100 Milwaukee St., Chicago, IL' are also tried. """ # skip geocoding if event is in the past if item.get('start_time') is None: spider.logger.debug( 'GEOCODER PIPELINE: Ignoring event without start_time {0}'. format(item['id'])) return item dt = item['start_time'] if dt < datetime.datetime.now(dt.tzinfo): spider.logger.debug( 'GEOCODER PIPELINE: Ignoring past event {0}'.format( item['id'])) return item query = self._get_mapzen_query(item.get('location', {})) if not query: spider.logger.debug( 'GEOCODER PIPELINE: Empty query. Not geocoding {0}'.format( item['id'])) return item for suffix in ['', ' ave.', ' st.']: new_query = query.replace(', chicago, il', '{0}, chicago, il'.format(suffix)) time.sleep(randint(0, 3)) # to avoid rate limiting? updated_item = self._update_fromDB(new_query, item) if updated_item: spider.logger.debug( 'GEOCODER PIPELINE: Geocoded item from airtable cache.') return updated_item bad_addresses = ['Chicago, IL, USA', 'Illinois, USA', ''] for suffix in ['', ' ave.', ' st.']: new_query = query.replace(', chicago, il', '{0}, chicago, il'.format(suffix)) geocoded_item = self._geocode(new_query, item, spider) address = geocoded_item['location']['address'] if (address not in bad_addresses) and (address.endswith( 'Chicago, IL, USA')) and (self._hasDigit(address)): write_item = { 'mapzen_query': new_query, 'longitude': geocoded_item['location']['coordinates']['longitude'], 'latitude': geocoded_item['location']['coordinates']['latitude'], 'name': geocoded_item['location']['name'], 'address': geocoded_item['location']['address'], 'geocode': geocoded_item['geocode'], 'community_area': geocoded_item['community_area'] } self._geocodeDB_write(spider, write_item) spider.logger.debug( 'GEOCODER PIPELINE: Geocoded item from mapzen.') return geocoded_item spider.logger.exception(( "GEOCODER PIPELINE: Couldn't geocode using mapzen or airtable cache. " "Query: {0}. Item id: {1}").format(query, item['id'])) return item def _geocode(self, query, item, spider): """ Makes a Mapzen query and returns results. """ try: geocode = self.client.search(query, boundary_country='US', format='keys') except ValueError: spider.logger.debug( ('GEOCODER PIPELINE: Could not geocode, skipping. ' 'Query: {0}. Item id: {1}').format(query, item['id'])) except Exception as e: spider.logger.info( ('GEOCODER PIPELINE: Unknown error when geocoding, skipping. ' 'Query: {0}. Item id: {1}. Message: {2}').format( query, item['id'], str(e))) else: new_data = { 'location': { 'coordinates': { 'longitude': str(geocode['features'][0]['geometry']['coordinates'] [0]), 'latitude': str(geocode['features'][0]['geometry']['coordinates'] [1]) }, 'name': geocode['geocoding']['query']['parsed_text'].get( 'query', ''), 'address': geocode['features'][0]['properties']['label'], 'url': item.get('location', { 'url': '' }).get('url', '') }, 'geocode': json.dumps(geocode, indent=4, sort_keys=True), 'community_area': geocode['features'][0]['properties'].get('neighbourhood', '') } geocoded_item = item.copy() geocoded_item.update(new_data) return geocoded_item return {'location': {'address': ''}} def _hasDigit(self, string): """ Returns True if the string contains a digit. """ return any(char.isdigit() for char in string) def _get_mapzen_query(self, location_dict): """ Clean and item's location to make a mapzen query. All cleaned queries are lowercase and end with ', chicago, il'. """ name = location_dict.get('name', '').strip() address = location_dict.get('address', '').strip() query = ', '.join([ name, address ]).strip(', ').lower() # combine '{name}, {address}' and lowercase query = query.replace('-', ' ').replace('/', ' ') # remove special characters query = query.replace('milwukee', 'milwaukee').replace( 'milwuakee', 'milwaukee') # fix misspellings query = query.replace('n.', 'n. ').replace('s.', 's. ').replace( 'e.', 'e. ').replace('w.', 'w. ') query = re.sub(r' +', ' ', query) # remove repeated spaces query = re.sub(r',* chicago,*( il)* *\d*$', ', chicago, il', query) # remove zip code, standardize ', chicago, il' if not query: return '' if 'city hall' in query.lower(): return 'chicago city hall, chicago, il' if not query.endswith(', chicago, il'): return '{0}, chicago, il'.format(query) else: return query def _update_fromDB(self, query, item): """ Query the geocode database and update item with results. """ fetched_item = self._geocodeDB_fetch(query) try: new_data = { 'location': { 'coordinates': { 'longitude': str(fetched_item['longitude']), 'latitude': str(fetched_item['latitude']) }, 'name': fetched_item.get('name', ''), 'address': fetched_item['address'], 'url': item.get('location', { 'url': '' }).get('url', '') }, 'geocode': str(fetched_item.get('geocode', '')), 'community_area': fetched_item.get('community_area', '') } except: return {} else: updated_item = item.copy() updated_item.update(new_data) return updated_item def _geocodeDB_fetch(self, query): """ Fetch from geocode_database. """ try: return self.geocode_database.match('mapzen_query', query)['fields'] except: return None def _geocodeDB_write(self, spider, item): """ Write to geocode_database. """ spider.logger.debug('GEOCODER PIPELINE: Caching {0}'.format( item['mapzen_query'])) item['geocode_date_updated'] = datetime.datetime.now().isoformat() airtable_item = self.geocode_database.match('mapzen_query', item['mapzen_query']) if airtable_item: self.geocode_database.update_by_field('mapzen_query', item['mapzen_query'], item) else: self.geocode_database.insert(item)