def clean_list_record(self, record):
    # `record` is a dict-like feed record, so use subscript access throughout.
    record['title'] = convert_entities(record['title'])
    record['description'] = convert_entities(record['description'])
    # Don't know why, but some feeds have 'id' *instead* of 'link'.
    if record.get('id', '').startswith('http'):
        record['link'] = record['id']
    # This tries GeoRSS, RDF Geo, xCal, ...
    point, location_name = self.get_point_and_location_name(record)
    _short_title = record['title'][:30] + '...'
    if not point:
        raise SkipRecord(
            "Couldn't geocode any addresses in item %r" % _short_title)
    if not location_name:
        raise SkipRecord(
            "Skip, no location name and failed to reverse geocode %s for %r"
            % (point.wkt, _short_title))
    if not intersects_metro_bbox(point):
        # Check whether latitude and longitude seem to be reversed;
        # we've seen that in some bad feeds!
        reversed_loc = Point(point.y, point.x)
        if intersects_metro_bbox(reversed_loc):
            self.logger.info(
                "Got points in apparently reverse order, flipping them")
            point = reversed_loc
        else:
            raise SkipRecord("Skipping %r as %s,%s is out of bounds"
                             % (_short_title, point.y, point.x))
    record['location_name'] = location_name
    record['location'] = point
    return record
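# A minimal sketch of the loop that typically drives clean_list_record(),
# assuming an ebdata-style list/detail scraper. The record source
# (`list_records`) and persistence hook (`save_record`) are hypothetical
# stand-ins, shown only to illustrate how SkipRecord short-circuits a single
# bad record without aborting the whole run.
def _example_scrape_loop(scraper):
    for record in scraper.list_records():  # hypothetical record source
        try:
            record = scraper.clean_list_record(record)
        except SkipRecord as e:
            # clean_list_record() raises SkipRecord for anything it can't
            # geocode or that falls outside the metro bounding box.
            scraper.logger.info("Skipping record: %s" % e)
            continue
        scraper.save_record(record)  # hypothetical persistence hook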
        loc = item.location.centroid
        print "Found %r outside bounds at %s, %s" % (item.title, loc.x, loc.y)
    else:
        loc = None
        print "NO location on %s: %s" % (item.schema.slug, item.title)
    fixed = False
    if item.location_name:
        from ebpub.geocoder import SmartGeocoder, AmbiguousResult
        try:
            result = SmartGeocoder().geocode(item.location_name)
        except AmbiguousResult as e:
            print "...%d choices, picking the first one" % len(e.choices)
            result = e.choices[0]
        except Exception:
            result = None
        if result and intersects_metro_bbox(result['point']):
            print "Fixing %r by geocoding %r" % (item.title, item.location_name)
            item.location = result['point']
            fixed = True
    if loc and not fixed:
        # Try swapping the coordinates; bad feeds sometimes reverse them.
        newloc = Point(loc.y, loc.x)
        if intersects_metro_bbox(newloc):
            print "Fixing %r by flipping bounds" % item.title
            item.location = newloc
            fixed = True
    if fixed:
        if not dry_run:
            print "saving %s" % item
            item.save()
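# The fix-up fragment above begins mid-loop; a plausible opening,
# reconstructed from its first two branches (an assumption, not the
# original wiring), would be:
#
#     for item in NewsItem.objects.all():
#         if item.location and not intersects_metro_bbox(item.location):
#             loc = item.location.centroid
#             ...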
def update(xmlfile, options):
    logger.info("Scraping University of Missouri police reports")
    if options.days == -1:
        start_date = datetime.date(1970, 1, 1)
    else:
        start_date = datetime.date.today() - datetime.timedelta(days=options.days)

    schema_slug = 'mupd'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    # We use iterparse() to avoid keeping the whole xml tree in memory;
    # this is a pretty big file.
    # See http://effbot.org/zone/element-iterparse.htm
    context = iter(lxml.etree.iterparse(xmlfile, events=('start', 'end')))
    addcount = updatecount = 0
    event, root = context.next()
    for event, elem in context:
        if event == 'end' and elem.tag == 'Table':
            category = cleanup(elem.findtext('Description'))
            lat = cleanup(elem.findtext('Lat'))
            lon = cleanup(elem.findtext('Lon'))
            item_date = cleanup(elem.findtext('CreateDatetime'))
            house_number = cleanup(elem.findtext('HouseNumber'))
            prefix = cleanup(elem.findtext('StreetPrefix'))
            street = cleanup(elem.findtext('StreetName'))
            streettype = cleanup(elem.findtext('StreetType'))
            suffix = cleanup(elem.findtext('StreetSuffix'))
            incident_number = cleanup(elem.findtext('IncidentNumber'))
            # We're done with this <Table> tag; clear the root element
            # that iterparse is building, to avoid bloating memory with
            # empty elements.
            root.clear()
        else:
            continue

        if item_date:
            item_date = pyrfc3339.parse(item_date)
            if item_date.date() < start_date:
                logger.debug("Date %s is older than start date, skipping." % item_date)
                continue
        else:
            logger.debug("No parsable date, skipping.")
            continue

        location_parts = [house_number, prefix, street, streettype, suffix]
        location_name = ' '.join([s for s in location_parts if s])
        if location_name:
            title = '%s: %s' % (location_name.title(), category.title())
        else:
            title = category.title()

        try:
            lon, lat = float(lon), float(lat)
            location = Point(lon, lat)
        except ValueError:
            location = None
        if location and not intersects_metro_bbox(location):
            logger.info("SKIP %s (at %s), not within our metro area"
                        % (title, (location.x, location.y)))
            continue

        cat_field = SchemaField.objects.get(schema=schema, name='category')
        cat_lookup = Lookup.objects.get_or_create_lookup(
            cat_field, category, category, "", False)
        attributes = {'incident_number': incident_number,
                      'category': cat_lookup.id}
        incident_number_field = SchemaField.objects.get(schema=schema,
                                                        name='incident_number')
        try:
            item = NewsItem.objects.filter(schema__id=schema.id).by_attribute(
                incident_number_field, incident_number)[0]
            status = 'updated'
        except IndexError:
            item = NewsItem(pub_date=datetime.datetime.now())
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched incident_number %s" % incident_number)
            continue

        logger.debug("%s %s" % (status, incident_number))
        try:
            item.title = title
            item.schema = schema
            item.item_date = item_date.date()
            item.description = title  # We don't have anything more verbose!
            item.location = location
            item.location_name = location_name
            item.save()
            item.attributes = attributes
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s"
                         % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated"
                % (addcount, updatecount))
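# cleanup() is applied above to every findtext() result but isn't defined
# in this file. A minimal sketch consistent with that usage (an assumption,
# not necessarily the original implementation):
def cleanup(value):
    # findtext() returns None for missing elements; strip whitespace from
    # everything else so that all-whitespace values stay falsy.
    if value is None:
        return value
    return value.strip()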
def update(self, searchTerm, searchOffset):
    youtubeAPI = YouTubeAPI()
    numentries = 50  # How many results we want the API to return.
    logger.info("Starting YouTube_Scraper")
    response = youtubeAPI.runQuery(searchTerm, numentries, searchOffset)
    seencount = addcount = updatecount = 0

    def _find_ns_value(entry, key):
        # Entry keys are namespace-prefixed ('ns0:title', 'ns1:title', ...)
        # and the prefix that carries a given field varies, so scan ns0-ns8
        # and return the first non-empty value.
        for count in range(9):
            value = entry.get('ns%d:%s' % (count, key))
            if value:
                return value
        return None

    if response:
        for entry in response:
            seencount += 1
            title = _find_ns_value(entry, 'title')
            if not title:
                logger.info("Skipping, as title is empty.")
                continue
            try:
                newsItem = NewsItem.objects.get(title=title,
                                                schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                newsItem = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                newsItem.schema = self.schema
                description = _find_ns_value(entry, 'description')
                if not description:
                    logger.info("Skipping %r as description is empty." % title)
                    continue
                newsItem.description = description
                newsItem.url = entry['ns0:link']
                newsItem.title = title
                # Published timestamps look like '2011-05-10T12:34:56.000Z';
                # we only keep the date part.
                published = _find_ns_value(entry, 'published')
                yt_date = published.split('T')[0]
                newsItem.pub_date = datetime.datetime.now()
                newsItem.item_date = yt_date.encode("utf-8")
                _short_title = newsItem.title[:30] + '...'
                #newsItem.location_name = 'Kent'
                # georss:pos is '<latitude> <longitude>'.
                lat, lon = [float(n) for n in _find_ns_value(entry, 'pos').split()]
                newsItem.location = Point(lon, lat)
                if not intersects_metro_bbox(newsItem.location):
                    # Check whether latitude and longitude were reversed
                    # in the feed.
                    reversed_loc = Point(lat, lon)
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        newsItem.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds"
                                    % (_short_title, lat, lon))
                        continue
                if not newsItem.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(newsItem.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        newsItem.location_name = block.pretty_name
                        newsItem.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r"
                                    % (newsItem.location.wkt, _short_title))
                        continue
                attributes_ = {'photo_href': entry['ns0:thumb'],
                               'videoID': entry['ns0:video_id'],
                               'searchTerm': searchTerm}
                newsItem.save()
                newsItem.attributes = attributes_
                newsItem.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, newsItem.title))
            except Exception as e:
                logger.exception("unexpected error: %s" % e)
    logger.info("YouTube_Scraper finished: %d added, %d updated of %s total"
                % (addcount, updatecount, seencount))
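# The _find_ns_value() helper above exists because the parsed YouTube
# entries arrive as flat dicts whose keys carry unpredictable namespace
# prefixes. A toy illustration (the entry contents here are made up):
#
#     entry = {'ns0:link': 'http://example.com/v/abc',
#              'ns1:title': 'Street fair video',
#              'ns1:pos': '38.95 -92.33'}
#     _find_ns_value(entry, 'title')  # -> 'Street fair video'
#     _find_ns_value(entry, 'pos')    # -> '38.95 -92.33'
#     _find_ns_value(entry, 'missing')  # -> None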
def update(self):
    logger.info("Starting LocalNewsScraper update %s" % self.url)
    try:
        schema = Schema.objects.get(slug=self.schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
        return 1

    response, content = self.http.request(self.url)
    if response.fromcache:
        logger.info("Feed is unchanged since last update (cached)")
        return

    f = feedparser.parse(content)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)
        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
                                        #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!"
                        % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            # Support both georss and xcal for getting the location name.
            # TODO: should also support ev:location per
            # http://web.resource.org/rss/1.0/modules/event/
            item.location_name = (entry.get('xCal_x-calconnect-street')
                                  or entry.get('x-calconnect-street')
                                  or entry.get('georss_featurename')
                                  or entry.get('featurename'))
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            _short_title = item.title[:30] + '...'

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                if item.location_name:
                    text = item.location_name
                else:
                    # Geocode whatever we can find.
                    text = item.title + ' ' + item.description
                logger.debug("...Falling back on geocoding from %r..." % text[:50])
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        if not item.location_name:
                            item.location_name = result['address']
                        item.block = result['block']
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' % addr)
                        log_exception()
            if None in (x, y):
                logger.debug("Skip, couldn't geocode any addresses in item %r"
                             % _short_title)
                continue
            item.location = Point((float(x), float(y)))
            if not intersects_metro_bbox(item.location):
                reversed_loc = Point((float(y), float(x)))
                if intersects_metro_bbox(reversed_loc):
                    logger.info(
                        "Got points in apparently reverse order, flipping them")
                    item.location = reversed_loc
                else:
                    logger.info("Skipping %r as %s,%s is out of bounds"
                                % (_short_title, y, x))
                    continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.info(" Skip, failed to reverse geocode %s for %r"
                                % (item.location.wkt, _short_title))
                    continue
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, _short_title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
            log_exception()
    logger.info("Finished LocalNewsScraper update: %d added, %d updated"
                % (addcount, updatecount))
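# intersects_metro_bbox() (imported from ebpub) is the bounds guard used by
# every scraper above. A simplified sketch of the idea, assuming the metro
# extent is available as an (xmin, ymin, xmax, ymax) tuple; the real helper
# reads the extent from the metro configuration:
def _intersects_bbox(geom, extent):
    from django.contrib.gis.geos import Polygon
    bbox = Polygon.from_bbox(extent)  # (xmin, ymin, xmax, ymax)
    return geom.intersects(bbox)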