def process_item(self, item, spider):
    """Flatten ``item`` into exporter fields, export it, and return it.

    A copy of ``item`` is modified; the caller's object is left untouched.
    ``start_time`` and ``end_time`` are rendered as ``'%Y-%m-%d %H:%M'``
    strings when possible, nested ``location`` values and the first entry
    of ``sources`` are lifted to top-level keys, and ``agency_name`` and
    ``scraped_time`` are added. Only keys listed in
    ``self.exporter.fields_to_export`` are kept, each passed through
    ``self._format_values``.

    Args:
        item: The scraped item; must support ``copy()``, ``get()`` and
            ``items()``.
        spider: The running spider; ``spider.agency_name`` is read.

    Returns:
        The flattened, filtered dict that was handed to
        ``self.exporter.export_item``.
    """
    new_item = item.copy()

    # Best-effort formatting: a missing key or a value that is not a
    # datetime is left as-is instead of aborting the export.
    for time_field in ('start_time', 'end_time'):
        try:
            new_item[time_field] = datetime.datetime.strftime(
                new_item[time_field], '%Y-%m-%d %H:%M')
        except (KeyError, TypeError, ValueError):
            pass

    # flatten location
    new_item['location_url'] = get_key(new_item, 'location.url')
    new_item['location_name'] = get_key(new_item, 'location.name')
    new_item['location_address'] = get_key(new_item, 'location.address')

    # Only the first source is exported. ``or [{}]`` covers a missing,
    # None, or empty ``sources`` so indexing [0] cannot fail.
    first_source = (new_item.get('sources') or [{}])[0]
    new_item['source_url'] = first_source.get('url', '')
    new_item['source_note'] = first_source.get('note', '')

    new_item['agency_name'] = spider.agency_name

    # self.stamp is parsed as '%Y%m%d_%H%M' and re-rendered in the same
    # format used for the start/end times above.
    new_item['scraped_time'] = datetime.datetime.strftime(
        datetime.datetime.strptime(self.stamp, '%Y%m%d_%H%M'),
        '%Y-%m-%d %H:%M')

    new_item = {
        k: self._format_values(k, v)
        for k, v in new_item.items()
        if k in self.exporter.fields_to_export
    }
    self.exporter.export_item(new_item)
    return new_item
def process_item(self, item, spider):
    """Save a flattened copy of an upcoming event via ``self.save_item``.

    Items without a ``start_time``, or whose ``start_time`` is already in
    the past, are returned untouched without being saved. Otherwise a copy
    is flattened (location, coordinates, agency name, first source URL),
    reduced to the keys in ``KEEP_FIELDS`` with each value passed through
    ``self._format_values``, and handed to ``self.save_item``.

    Args:
        item: The scraped item; must support ``get()``, ``copy()`` and
            ``items()``.
        spider: The running spider; ``spider.logger`` and
            ``spider.long_name`` are used.

    Returns:
        The original ``item`` (not the flattened copy).

    Raises:
        DropItem: If ``self.save_item`` raises any exception.
    """
    if item.get('start_time') is None:
        spider.logger.debug(
            'AIRTABLE PIPELINE: Ignoring event without start_time {0}'.
            format(item['id']))
        return item

    dt = item['start_time']
    # Compare against "now" in the item's own tzinfo so that aware and
    # naive datetimes are never mixed.
    if dt < datetime.datetime.now(dt.tzinfo):
        spider.logger.debug(
            'AIRTABLE PIPELINE: Ignoring past event {0}'.format(
                item['id']))
        return item

    time.sleep(randint(0, 3))  # to avoid rate limiting?

    # copy item; airtable-specific munging is happening here that breaks
    # opencivicdata standard
    new_item = item.copy()

    # flatten location
    new_item['location_url'] = get_key(new_item, 'location.url')
    new_item['location_name'] = get_key(new_item, 'location.name')
    new_item['location_address'] = get_key(new_item, 'location.address')
    new_item['location_latitude'] = get_key(
        new_item, 'location.coordinates.latitude')
    new_item['location_longitude'] = get_key(
        new_item, 'location.coordinates.longitude')

    new_item['agency_name'] = spider.long_name

    # Only the first source's URL is kept. ``or [{}]`` covers a missing,
    # None, or empty ``sources`` so indexing [0] cannot fail.
    first_source = (new_item.get('sources') or [{}])[0]
    new_item['url'] = first_source.get('url', '')

    new_item = {
        k: self._format_values(k, v)
        for k, v in new_item.items()
        if k in KEEP_FIELDS
    }

    # .get() so a filtered-out 'id' cannot raise inside an error handler.
    item_id = new_item.get('id')
    try:
        self.save_item(new_item, spider)
    except HTTPError as e:
        spider.logger.error('HTTP error')
        spider.logger.error(e.response.content)
        spider.logger.exception('Original message')
        spider.logger.error(json.dumps(new_item, indent=4, sort_keys=True))
        raise DropItem('Could not save {0}'.format(item_id))
    except Exception:
        spider.logger.exception('Unknown error')
        # Drop explicitly rather than falling through and returning None,
        # which would hand None to whatever processes the item next.
        raise DropItem('Could not save {0}'.format(item_id))
    return item