def download_state_shapefile(state, zipcodes):
    print "Starting download"
    n = dict(CENSUS_STATES)[state].upper().replace(' ', '_')
    name = "tl_2009_%s_zcta5" % state
    zip_filename = "%s.zip" % name
    cache_dir = getattr(settings, 'HTTP_CACHE', tempfile.gettempdir())
    path = os.path.join(cache_dir, zip_filename)
    url = "http://tigerline.census.gov/geo/tiger/TIGER2009/%(id)s_%(name)s/%(zip_filename)s" % {
        'id': state, 'name': n, 'zip_filename': zip_filename}
    Retriever().cached_get_to_file(url, path)
    print "fetched %s" % url
    files = [name + ext for ext in ('.shp', '.dbf', '.prj', '.shp.xml', '.shx')]
    shapefile = os.path.join(cache_dir, '%s.shp' % name)
    # TODO: handle corrupt/incomplete/missing files
    # (zipfile: expected files aren't in the archive ...)
    try:
        ZipFile(path, 'r').extractall(cache_dir, files)
        print "extracted"
    except:
        log_exception()
        return
    for zipcode in zipcodes:
        print "importing %s" % zipcode
        import_zip_from_shapefile(shapefile, zipcode)
        print "... ok"
    print "All zip codes done"
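# A minimal invocation sketch. The FIPS code and ZIP codes below are
# hypothetical examples; '25' is assumed to be a valid key in CENSUS_STATES
# (the tl_2009_25_zcta5.zip TIGER naming suggests two-digit state FIPS codes).
download_state_shapefile('25', ['02139', '02141'])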
def import_zip_from_shapefile(filename, zipcode):
    layer = layer_from_shapefile(filename, 0)
    importer = ZipImporter(layer, 'ZCTA5CE')
    try:
        importer.import_zip(zipcode)
    except:
        log_exception()
        return
def save(self, old_record, list_record, detail_record):
    # This gets called once all parsing and cleanup is done.
    # It looks a lot like our 'expedient hack' code above.
    # We can ignore detail_record since has_detail is False.
    date = datetime.date(*list_record['updated_parsed'][:3])
    description = text_from_html(list_record['summary'])
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    # First we'll need some suitable text; throw away HTML tags.
    # full_description = list_record['content'][0]['value']
    # full_description = text_from_html(full_description)
    grabber = places.location_grabber()
    addrs = grabber(description)
    # printing articles title for debugging
    # print list_record['title']
    if not addrs:
        addrs = grabber(list_record['title'])
    if not addrs:
        self.logger.info("no addresses found")
        return
    location = None
    location_name = u''
    block = None
    # Ready to geocode. If we had one location_name to try,
    # this could be done automatically in create_or_update(), but
    # we have multiple possible location_names.
    for l, r, name in addrs:
        # addr = addr.strip()
        try:
            locationSyn = LocationSynonym.objects.get(pretty_name=name)
            location = Location.objects.get(name=locationSyn.location).location
        except GeocodingException:
            log_exception(level=logging.DEBUG)
            continue
        location_name = name
        # block = location['block']
        # location = location['point']
        break
    if location is None:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return
    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  description=description,
                  title=list_record['title'],
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
def template_context_for_item(newsitem, widget=None):
    # try to make something ... reasonable for use in
    # templates.
    ctx = {
        'attributes': [],
        'attributes_by_name': {},
        '_item': newsitem,  # cached in case downstream code really needs it.
    }
    for att in newsitem.attributes_for_template():
        attr = {
            'name': att.sf.name,
            'title': att.sf.smart_pretty_name(),
            'display': att.sf.display,
        }
        vals = [x['value'] for x in att.value_list()]
        if len(vals) == 1:
            attr['value'] = vals[0]
            attr['is_list'] = False
        else:
            attr['value'] = vals
            attr['is_list'] = True
        ctx['attributes'].append(attr)
        ctx['attributes_by_name'][att.sf.name] = attr
    # newsitem fields
    ctx['id'] = newsitem.id
    ctx['schema'] = newsitem.schema
    ctx['title'] = newsitem.title
    ctx['description'] = newsitem.description
    ctx['pub_date'] = newsitem.pub_date
    ctx['item_date'] = newsitem.item_date
    ctx['location'] = {}
    if newsitem.location:
        # TODO: Is centroid really what we want for non-Point geometries?
        ctx['location']['lon'] = newsitem.location.centroid.x
        ctx['location']['lat'] = newsitem.location.centroid.y
        ctx['location']['geom'] = newsitem.location
    ctx['location']['name'] = newsitem.location_name
    ctx['external_url'] = newsitem.url
    if newsitem.schema.has_newsitem_detail:
        ctx['internal_url'] = 'http://' + settings.EB_DOMAIN + newsitem.item_url()
    if widget is not None:
        if widget.item_link_template and widget.item_link_template.strip():
            try:
                ctx['internal_url'] = _eval_item_link_template(
                    widget.item_link_template, {'item': ctx, 'widget': widget})
            except:
                log_exception()
                # TODO: some sort of error handling
                return '#error'
    return ctx
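# A hedged sketch of how the returned context might be consumed downstream;
# `newsitem` here stands for any hypothetical NewsItem instance.
ctx = template_context_for_item(newsitem)
print ctx['title'], ctx['item_date']
if 'lat' in ctx['location']:
    print "at %s (%s, %s)" % (ctx['location']['name'],
                              ctx['location']['lat'],
                              ctx['location']['lon'])
for attr in ctx['attributes']:
    print "%s: %r" % (attr['title'], attr['value'])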
def unzip(filename, cwd=None):
    """Unzip filename, write extracted files into cwd
    (default is the current dir).
    """
    try:
        zfile = zipfile.ZipFile(filename)
        zfile.extractall(path=cwd)
        return True
    except:
        log_exception()
        return False
def makedirs(path):
    """Emulates the `mkdir -p` shell command.
    """
    if os.path.exists(path):
        return True
    try:
        os.makedirs(path)
        return True
    except:
        log_exception()
        return False
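# A small usage sketch for the two helpers above: both swallow errors via
# log_exception() and signal failure through their boolean return value.
# The archive and directory paths are hypothetical.
archive = '/tmp/tl_2009_25_zcta5.zip'
target = '/tmp/zcta5-staging'
if makedirs(target) and unzip(archive, cwd=target):
    print "extracted %s into %s" % (archive, target)
else:
    print "extraction failed; details were written to the exception log"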
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting ObituaryScraper") feed = feedparser.parse(self.url) total_created = 0 for entry in feed.entries: title = convert_entities(entry.title) try: created = self.parse_entry(entry, title) if created: total_created += 1 except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() break logger.info("Created %d of %d total" % (created, len(feed.entries)))
def import_blocks_from_shapefiles(edges, featnames, faces, place,
                                  city=None, fix_cities=False,
                                  regenerate_intersections=True):
    # File args are paths to zip files.
    outdir = mkdtemp(suffix='-block-shapefiles')
    try:
        for path in (edges, featnames, faces, place):
            ZipFile(path, 'r').extractall(outdir)
    except:
        # TODO: display error in UI
        log_exception()
        shutil.rmtree(outdir)
        raise
    finally:
        os.unlink(edges)
        os.unlink(featnames)
        os.unlink(faces)
        os.unlink(place)
    try:
        edges = glob.glob(os.path.join(outdir, '*edges.shp'))[0]
        featnames = glob.glob(os.path.join(outdir, '*featnames.dbf'))[0]
        faces = glob.glob(os.path.join(outdir, '*faces.dbf'))[0]
        place = glob.glob(os.path.join(outdir, '*place.shp'))[0]
        tiger = TigerImporter(
            edges, featnames, faces, place,
            filter_city=city,
            fix_cities=fix_cities,
        )
        num_created = tiger.save()
    finally:
        shutil.rmtree(outdir)
    if regenerate_intersections:
        populate_streets_task()
    return num_created
try:
    place.pretty_name = pretty_name
    place.address = address
    place.location = point
    place.url = place_url
    place.place_type = place_type
    place.save()
    if created:
        message = 'Created new place %s' % (pretty_name)
    else:
        message = 'Updated place %s' % (pretty_name)
    context['actions_taken'].append(message)
except:
    log_exception()
    message = 'Error adding place "%s"' % pretty_name
    context['errors'].append(message)
    continue

# now update Synonyms
# destroy synonyms not in the new list, identify new synonyms
new_synonyms = set(synonyms)
for synonym in PlaceSynonym.objects.filter(place=place).all():
    if synonym.pretty_name not in new_synonyms:
        synonym.delete()
        message = 'Removing old synonym "%s" for "%s"' % (synonym.pretty_name, pretty_name)
        context['actions_taken'].append(message)
    else:
def update(self): logger.info("Starting LocalNewsScraper update %s" % self.url) try: schema = Schema.objects.get(slug=self.schema_slug) except Schema.DoesNotExist: logger.error( "Schema (%s): DoesNotExist" % self.schema_slug) return 1 response, content = self.http.request(self.url) if response.fromcache: logger.info("Feed is unchanged since last update (cached)") return f = feedparser.parse(content) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url # Support both georss and xcal for getting the location name. # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/ item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) _short_title = item.title[:30] + '...' # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: # GeoRSS puts latitude (Y) first. y, x = point.split(' ') else: if item.location_name: text = item.location_name else: # Geocode whatever we can find. text = item.title + ' ' + item.description logger.debug("...Falling back on geocoding from %r..." % text[:50]) addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y if not item.location_name: item.location_name = result['address'] item.block = result['block'] break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error('uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.debug("Skip, couldn't geocode any addresses in item '%s...'" % _short_title) continue item.location = Point((float(x), float(y))) if not intersects_metro_bbox(item.location): reversed_loc = Point((float(y), float(x))) if intersects_metro_bbox(reversed_loc): logger.info( "Got points in apparently reverse order, flipping them") item.location = reversed_loc else: logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x)) continue if not item.location_name: # Fall back to reverse-geocoding. 
from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title)) continue item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, _short_title)) except: logger.error("Warning: couldn't save %r. Traceback:" % _short_title) log_exception() logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting EventsCalendarScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: def ns_get(element): # work around feedparser unpredictability. namespace, element = element.split(':') result = entry.get('%s_%s' % (namespace, element)) if result is None: result = entry.get(element) return result seencount += 1 title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'), ns_get('xcal:x-calconnect-street')) item.location_name = item.location_name.strip() item.schema = self.schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link start_dt = ns_get('xcal:dtstart') start_dt = dateutil.parser.parse(start_dt) # Upstream bug: They provide a UTC offset of +0000 which # means times in UTC, but they're actually times in # US/Eastern, so do *not* fix the zone. #start_dt = start_dt.astimezone(local_tz) item.item_date = start_dt.date() item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(ns_get('geo:long')), float(ns_get('geo:lat')))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() item.attributes['start_time'] = start_dt.time() end_dt = ns_get('xcal:dtend') or u'' if end_dt.strip(): end_dt = dateutil.parser.parse(end_dt.strip()) #end_dt = end_dt.astimezone(local_tz) item.attributes['end_time'] = end_dt.time() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def save(self, old_record, list_record, detail_record):
    # TODO: move some of this to clean_list_record?
    date = datetime.date(*list_record['updated_parsed'][:3])
    # Get the precinct from the tags.
    precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                 'E13', 'E18', 'E5']
    precinct = None
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return
    for precinct in tags:
        if precinct in precincts:
            # TODO: we need a LocationType for precincts, and shapes; and
            # then we could set newsitem.location_object to the Location
            # for this precinct.
            break
    else:
        precinct = None
    if not precinct:
        self.logger.debug("no precinct found in tags %r" % tags)

    description = list_record['summary']
    full_description = list_record['content'][0]['value']
    full_description = text_from_html(full_description)
    addrs = parse_addresses(full_description)
    if not addrs:
        self.logger.info("no addresses found in %r %r" % (list_record['title'], list_record['link']))
        return

    location = None
    location_name = u''
    block = None
    # This feed doesn't provide geographic data; we'll try to
    # extract addresses from the text, and stop on the first
    # one that successfully geocodes.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            location = SmartGeocoder().geocode(addr)
        except (GeocodingException, ParsingError):
            log_exception(level=logging.DEBUG)
            continue
        location_name = location['address']
        location = location['point']
        break
    else:
        self.logger.info("no addresses geocoded in %r" % list_record['title'])
        return

    kwargs = dict(item_date=date,
                  location=location,
                  location_name=location_name,
                  title=list_record['title'],
                  description=description,
                  url=list_record['link'],
                  )
    attributes = None
    self.create_or_update(old_record, attributes, **kwargs)
def update(self): logger.info("Starting LocalNewsScraper update %s" % self.url) try: schema = Schema.objects.get(slug=self.schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % self.schema_slug) return 1 response, content = self.http.request(self.url) if response.fromcache: logger.info("Feed is unchanged since last update (cached)") return f = feedparser.parse(content) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn( "Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url # Support both georss and xcal for getting the location name. # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/ item.location_name = entry.get( 'xCal_x-calconnect-street') or entry.get( 'x-calconnect-street') or entry.get( 'georss_featurename') or entry.get('featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) _short_title = item.title[:30] + '...' # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: # GeoRSS puts latitude (Y) first. y, x = point.split(' ') else: if item.location_name: text = item.location_name else: # Geocode whatever we can find. text = item.title + ' ' + item.description logger.debug("...Falling back on geocoding from %r..." % text[:50]) addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y if not item.location_name: item.location_name = result['address'] break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error( 'uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.debug( "Skip, couldn't geocode any addresses in item '%s...'" % _short_title) continue item.location = Point((float(x), float(y))) if not intersects_metro_bbox(item.location): reversed_loc = Point((float(y), float(x))) if intersects_metro_bbox(reversed_loc): logger.info( "Got points in apparently reverse order, flipping them" ) item.location = reversed_loc else: logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x)) continue if not item.location_name: # Fall back to reverse-geocoding. 
from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode( item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.info( " Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title)) continue item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, _short_title)) except: logger.error("Warning: couldn't save %r. Traceback:" % _short_title) log_exception() logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
def update(self): """ Download Calendar RSS feed and update database """ logger.info("Starting EventsCalendarScraper") feed = feedparser.parse(self.url) seencount = addcount = updatecount = 0 for entry in feed.entries: def ns_get(element): # work around feedparser unpredictability. namespace, element = element.split(':') result = entry.get('%s_%s' % (namespace, element)) if result is None: result = entry.get(element) return result seencount += 1 title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=self.schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'), ns_get('xcal:x-calconnect-street')) item.location_name = item.location_name.strip() item.schema = self.schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link start_dt = ns_get('xcal:dtstart') start_dt = dateutil.parser.parse(start_dt) # Upstream bug: They provide a UTC offset of +0000 which # means times in UTC, but they're actually times in # US/Eastern, so do *not* fix the zone. #start_dt = start_dt.astimezone(local_tz) item.item_date = start_dt.date() item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(ns_get('geo:long')), float(ns_get('geo:lat')))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() item.attributes['start_time'] = start_dt.time() end_dt = ns_get('xcal:dtend') or u'' if end_dt.strip(): end_dt = dateutil.parser.parse(end_dt.strip()) #end_dt = end_dt.astimezone(local_tz) item.attributes['end_time'] = end_dt.time() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
def update(): """ Download Calendar RSS feed and update database """ logger.info("Starting add_events") url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema) sys.exit(1) feed = feedparser.parse(url) addcount = updatecount = 0 for entry in feed.entries: title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn( "Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = entry.get( 'xcal_x-calconnect-street') or entry.get( 'x-calconnect-street') or u'' item.schema = schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point( (float(entry['geo_long']), float(entry['geo_lat']))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
def search(request, schema_slug=''):
    "Performs a location search and redirects to the address/xy page."
    # Check whether a schema was provided.
    if schema_slug:
        try:
            schema = get_schema_manager(request).get(slug=schema_slug)
        except Schema.DoesNotExist:
            raise Http404('Schema does not exist')
        url_prefix = schema.url()[:-1]
    else:
        schema = None
        url_prefix = ''

    # Get the query.
    q = request.GET.get('q', '').strip()
    if not q:
        return HttpResponseRedirect(url_prefix + '/')  # TODO: Do something better than redirecting.

    # For /search/?type=alert, we redirect results to the alert page, not the
    # place page.
    if request.GET.get('type', '') == 'alert':
        url_method = 'alert_url'
    else:
        url_method = 'url'

    # Try to geocode it using full_geocode().
    try:
        result = full_geocode(q, search_places=False)
    except:
        logger.debug('Unhandled exception from full_geocode:')
        log_exception(level=logging.DEBUG, logger=logger)
    else:
        if result['ambiguous']:
            if result['type'] == 'block':
                streets = []
                street_blocks = {}
                for block in result['result']:
                    street_name = block.street_pretty_name
                    if street_name not in streets:
                        streets.append(street_name)
                        street_blocks[street_name] = []
                    street_blocks[street_name].append(block)
                choices = [{'name': s, 'blocks': street_blocks[s]} for s in streets]
                return eb_render(request, 'db/search_invalid_block.html', {
                    'query': q,
                    'choices': choices,
                })
            else:
                return eb_render(request, 'db/did_you_mean.html',
                                 {'query': q, 'choices': result['result']})
        elif result['type'] == 'location':
            return HttpResponseRedirect(url_prefix + getattr(result['result'], url_method)())
        elif result['type'] == 'address':
            # Block
            if result['result']['block']:
                return HttpResponseRedirect(url_prefix + getattr(result['result']['block'], url_method)())
            # Intersection
            try:
                intersection = Intersection.objects.get(id=result['result']['intersection_id'])
            except Intersection.DoesNotExist:
                pass
            else:
                return HttpResponseRedirect(url_prefix + getattr(intersection, url_method)())

    # Failing the geocoding, look in the special-case table.
    try:
        special_case = SearchSpecialCase.objects.get(query=normalize(q))
    except SearchSpecialCase.DoesNotExist:
        pass
    else:
        if special_case.redirect_to:
            return HttpResponseRedirect(special_case.redirect_to)
        else:
            return eb_render(request, 'db/search_special_case.html',
                             {'query': q, 'special_case': special_case})

    # Failing that, display a list of ZIP codes if this looks like a ZIP.
    if re.search(r'^\s*\d{5}(?:-\d{4})?\s*$', q):
        z_list = Location.objects.filter(location_type__slug='zipcodes',
                                         is_public=True).select_related().order_by('name')
        if z_list:
            return eb_render(request, 'db/search_error_zip_list.html',
                             {'query': q, 'zipcode_list': z_list})

    # Failing all of that, display the search error page.
    lt_list = LocationType.objects.filter(is_significant=True).order_by('name')
    return eb_render(request, 'db/search_error.html',
                     {'query': q, 'locationtype_list': lt_list})
def update(argv=None): logger.info("Starting add_news") if argv: url = argv[0] else: url = "http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai" schema_slug = "local-news" try: schema = Schema.objects.get(slug=schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema_slug) sys.exit(1) f = feedparser.parse(url) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith("http"): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) # url=item_url) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url item.location_name = entry.get("x-calconnect-street") or entry.get("georss_featurename") item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get("georss_point") or entry.get("point") x, y = None, None if point: x, y = point.split(" ") if True: # Fall back on geocoding. text = item.title + " " + item.description addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result["point"] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y break except GeocodingException: logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error("uncaught geocoder exception on %r\n" % addr) log_exception() if None in (x, y): logger.info("couldn't geocode '%s...'" % item.title[:30]) continue item.location = Point((float(y), float(x))) if item.location.x == 0.0 and item.location.y == 0.0: # There's a lot of these. Maybe attempt to # parse and geocode if we haven't already? logger.info("Skipping %r as it has bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name item.block = block except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u"" item.save() if status == "added": addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("Warning: couldn't save %r. Traceback:" % item.title) log_exception() logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
def update(): """ Download Calendar RSS feed and update database """ logger.info("Starting add_events") url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\ &new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\ &st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1""" schema = 'events' try: schema = Schema.objects.get(slug=schema) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema) sys.exit(1) feed = feedparser.parse(url) addcount = updatecount = 0 for entry in feed.entries: title = convert_entities(entry.title) try: item = NewsItem.objects.get(title=title, schema__id=schema.id) status = "updated" except NewsItem.DoesNotExist: item = NewsItem() status = "added" except NewsItem.MultipleObjectsReturned: logger.warn("Multiple entries matched title %r, event titles are not unique?" % title) continue try: item.location_name = entry.get('xcal_x-calconnect-street') or entry.get('x-calconnect-street') or u'' item.schema = schema item.title = title item.description = convert_entities(entry.description) item.url = entry.link item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) item.location = Point((float(entry['geo_long']), float(entry['geo_lat']))) if (item.location.x, item.location.y) == (0.0, 0.0): logger.warn("Skipping %r, bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.info(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("unexpected error:", sys.exc_info()[1]) log_exception() logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
def update(argv=None): logger.info("Starting add_news") if argv: url = argv[0] else: url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai' schema_slug = 'local-news' try: schema = Schema.objects.get(slug=schema_slug) except Schema.DoesNotExist: logger.error("Schema (%s): DoesNotExist" % schema_slug) sys.exit(1) f = feedparser.parse(url) addcount = updatecount = 0 for entry in f.entries: title = convert_entities(entry.title) description = convert_entities(entry.description) if entry.id.startswith('http'): item_url = entry.id else: item_url = entry.link try: item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description) #url=item_url) status = 'updated' except NewsItem.DoesNotExist: item = NewsItem() status = 'added' except NewsItem.MultipleObjectsReturned: # Seen some where we get the same story with multiple URLs. Why? logger.warn( "Multiple entries matched title %r and description %r. Expected unique!" % (title, description)) continue try: item.title = title item.schema = schema item.description = description item.url = item_url item.location_name = entry.get('x-calconnect-street') or entry.get( 'georss_featurename') item.item_date = datetime.datetime(*entry.updated_parsed[:6]) item.pub_date = datetime.datetime(*entry.updated_parsed[:6]) # feedparser bug: depending on which parser it magically uses, # we either get the xml namespace in the key name, or we don't. point = entry.get('georss_point') or entry.get('point') x, y = None, None if point: x, y = point.split(' ') if True: # Fall back on geocoding. text = item.title + ' ' + item.description addrs = parse_addresses(text) for addr, unused in addrs: try: result = SmartGeocoder().geocode(addr) point = result['point'] logger.debug("internally geocoded %r" % addr) x, y = point.x, point.y break except (GeocodingException, ParsingError): logger.debug("Geocoding exception on %r:" % text) log_exception(level=logging.DEBUG) continue except: logger.error('uncaught geocoder exception on %r\n' % addr) log_exception() if None in (x, y): logger.info("couldn't geocode '%s...'" % item.title[:30]) continue item.location = Point((float(y), float(x))) if item.location.x == 0.0 and item.location.y == 0.0: # There's a lot of these. Maybe attempt to # parse and geocode if we haven't already? logger.info("Skipping %r as it has bad location 0,0" % item.title) continue if not item.location_name: # Fall back to reverse-geocoding. from ebpub.geocoder import reverse try: block, distance = reverse.reverse_geocode(item.location) logger.debug(" Reverse-geocoded point to %r" % block.pretty_name) item.location_name = block.pretty_name except reverse.ReverseGeocodeError: logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)) item.location_name = u'' item.save() if status == 'added': addcount += 1 else: updatecount += 1 logger.info("%s: %s" % (status, item.title)) except: logger.error("Warning: couldn't save %r. Traceback:" % item.title) log_exception() logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))