Exemple #1
0
    def clean_list_record(self, record):
        """Normalize one feed record and attach a validated location to it.

        Entity-decodes the title and description, copies an http-looking
        'id' into 'link', then resolves a point and a location name via
        ``self.get_point_and_location_name``. Returns the same ``record``
        with 'location_name' and 'location' filled in.

        Raises SkipRecord when no point was found, when no location name
        was found, or when neither the point nor its coordinate-swapped
        twin intersects the metro bounding box.
        """
        record.title = convert_entities(record['title'])
        record.description = convert_entities(record['description'])

        # Some feeds carry the URL in 'id' *instead* of 'link'.
        if record.get('id', '').startswith('http'):
            record['link'] = record['id']

        # Tries GeoRSS, RDF Geo, xCal, ...
        point, location_name = self.get_point_and_location_name(record)

        title_snippet = record['title'][:30] + '...'

        if not point:
            raise SkipRecord(
                "couldn't geocode any addresses in item '%s...'" % title_snippet)

        if not location_name:
            raise SkipRecord(
                "Skip, no location name and failed to reverse geocode %s for %r"
                % (point.wkt, title_snippet))

        if intersects_metro_bbox(point):
            located = point
        else:
            # Some bad feeds give latitude and longitude swapped; try the
            # swapped pair before giving up.
            flipped = Point(point.y, point.x)
            if not intersects_metro_bbox(flipped):
                raise SkipRecord("Skipping %r as %s,%s is out of bounds"
                                 % (title_snippet, point.y, point.x))
            self.logger.info(
                "Got points in apparently reverse order, flipping them")
            located = flipped

        record['location_name'] = location_name
        record['location'] = located
        return record
Exemple #2
0
    def clean_list_record(self, record):
        """Clean up a feed record and give it a location inside the metro area.

        Steps, in order: decode entities in title/description, promote an
        http 'id' to 'link', look up (point, location_name) through
        ``self.get_point_and_location_name``, and check the point against
        the metro bounding box (retrying with x and y swapped).

        Returns ``record`` with 'location_name' and 'location' set.
        Raises SkipRecord if there is no point, no location name, or the
        point is out of bounds in both coordinate orders.
        """
        record.title = convert_entities(record['title'])
        record.description = convert_entities(record['description'])

        # Feeds sometimes provide 'id' *instead* of 'link'.
        has_url_id = record.get('id', '').startswith('http')
        if has_url_id:
            record['link'] = record['id']

        # GeoRSS, RDF Geo, xCal, ... are all attempted here.
        point, location_name = self.get_point_and_location_name(record)

        abbreviated = record['title'][:30] + '...'

        if not point:
            raise SkipRecord(
                "couldn't geocode any addresses in item '%s...'" % abbreviated)

        if not location_name:
            raise SkipRecord(
                "Skip, no location name and failed to reverse geocode %s for %r"
                % (point.wkt, abbreviated))

        if not intersects_metro_bbox(point):
            # Latitude and longitude are reversed in some bad feeds, so give
            # the swapped coordinates one chance.
            swapped = Point(point.y, point.x)
            if not intersects_metro_bbox(swapped):
                raise SkipRecord(
                    "Skipping %r as %s,%s is out of bounds"
                    % (abbreviated, point.y, point.x))
            self.logger.info(
                "Got points in apparently reverse order, flipping them")
            point = swapped

        record['location_name'] = location_name
        record['location'] = point
        return record
        # NOTE(review): this is the tail of a routine whose `def` line and the
        # `if` that the `else` below pairs with are not visible here; `item`,
        # `loc` and `dry_run` are presumably set up in that missing part.
        print "Found %r outside bounds at %s, %s" % (item.title,
                                                     loc.x, loc.y)
    else:
        # No coordinates to work with, so the flip strategy below is skipped.
        loc = None
        print "NO location on %s: %s" % (item.schema.slug, item.title)
    fixed = False
    if item.location_name:
        # Strategy 1: geocode the stored location name and accept the result
        # only if it lands inside the metro bounding box.
        from ebpub.geocoder import SmartGeocoder, AmbiguousResult
        try:
            result = SmartGeocoder().geocode(item.location_name)
        except AmbiguousResult, e:
            # Several candidates came back; arbitrarily take the first.
            print "...%d choices, picking the first one" % len(e.choices)
            result = e.choices[0]
        except:
            # Any other failure is treated the same as "no result".
            result = None
        if result and intersects_metro_bbox(result['point']):
            print "Fixing %r by geocoding %r" % (item.title, item.location_name)
            item.location = result['point']
            fixed = True

    if loc and not fixed:
        # Strategy 2: swap the x and y coordinates and see whether that point
        # falls inside the metro bounding box.
        newloc = Point(loc.y, loc.x)
        if intersects_metro_bbox(newloc):
            print "Fixing %r by flipping bounds" % item.title
            item.location = newloc
            fixed = True

    if fixed:
        # Persist the corrected location unless this is a dry run.
        if not dry_run:
            print "saving %s" % item
            item.save()
def update(xmlfile, options):
    """Import University of Missouri police reports from an XML file.

    Streams ``xmlfile`` with ``lxml.etree.iterparse`` and, for every
    completed ``<Table>`` element, creates or updates a NewsItem in the
    'mupd' schema, matching existing items on their incident number.

    :param xmlfile: source handed unchanged to ``lxml.etree.iterparse``.
    :param options: object with a ``days`` attribute. ``-1`` disables the
        date cutoff (everything since 1970-01-01 is accepted); any other
        value keeps only reports from the last ``days`` days.

    Records are skipped when they have no date, are older than the cutoff,
    or have coordinates outside the metro bounding box. Records without
    usable coordinates are stored with ``location = None``.

    Calls ``sys.exit(1)`` if the 'mupd' schema does not exist.
    """
    logger.info("Scraping University of Missouri police reports")

    if options.days == -1:
        start_date = datetime.date(1970, 1, 1)
    else:
        start_date = datetime.date.today() - datetime.timedelta(days=options.days)

    schema_slug = 'mupd'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    # These depend only on the schema, so fetch them once instead of issuing
    # two extra queries for every record in the file.
    cat_field = SchemaField.objects.get(schema=schema, name='category')
    incident_number_field = SchemaField.objects.get(schema=schema,
                                                    name='incident_number')

    # We use iterparse() to avoid keeping the whole xml tree in memory,
    # this is a pretty big file.
    # See http://effbot.org/zone/element-iterparse.htm
    context = iter(lxml.etree.iterparse(xmlfile, events=('start', 'end')))
    addcount = updatecount = 0
    # The first event yields the root element, which we keep so it can be
    # cleared as we go.
    event, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag != 'Table':
            continue

        category = cleanup(elem.findtext('Description'))
        lat = cleanup(elem.findtext('Lat'))
        lon = cleanup(elem.findtext('Lon'))
        item_date = cleanup(elem.findtext('CreateDatetime'))
        house_number = cleanup(elem.findtext('HouseNumber'))
        prefix = cleanup(elem.findtext('StreetPrefix'))
        street = cleanup(elem.findtext('StreetName'))
        streettype = cleanup(elem.findtext('StreetType'))
        suffix = cleanup(elem.findtext('StreetSuffix'))
        incident_number = cleanup(elem.findtext('IncidentNumber'))
        # We're done with this <Table> tag; clear the root element
        # that iterparse is building to avoid bloating memory with
        # empty elements.
        root.clear()

        if item_date:
            item_date = pyrfc3339.parse(item_date)
            if item_date.date() < start_date:
                logger.debug("Date %s is older than start date, skipping." % item_date)
                continue
        else:
            logger.debug("No parsable date, skipping.")
            continue

        location_parts = [house_number, prefix, street, streettype, suffix]
        location_name = ' '.join([s for s in location_parts if s])
        if location_name:
            title = '%s: %s' % (location_name.title(), category.title())
        else:
            title = category.title()

        try:
            location = Point(float(lon), float(lat))
        except (TypeError, ValueError):
            # ValueError: text that is not a number. TypeError: the value is
            # None (findtext() returns None for a missing child, and cleanup
            # may pass that through). Either way there is no usable point.
            location = None

        if location and not intersects_metro_bbox(location):
            logger.info("SKIP %s (at %s), not within our metro area"
                        % (title, (location.x, location.y)))
            continue

        cat_lookup = Lookup.objects.get_or_create_lookup(
            cat_field, category, category, "", False)

        attributes = {'incident_number': incident_number,
                      'category': cat_lookup.id}

        try:
            item = NewsItem.objects.filter(schema__id=schema.id).by_attribute(incident_number_field, incident_number)[0]
            status = 'updated'
        except IndexError:
            # No existing item carries this incident number.
            item = NewsItem(pub_date=datetime.datetime.now())
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched incident_number %s" % incident_number)
            continue
        logger.debug("%s %s" % (status, incident_number))
        try:
            item.title = title
            item.schema = schema
            item.item_date = item_date.date()
            item.description = title # We don't have anything more verbose!
            item.location = location
            item.location_name = location_name
            # Attributes are assigned only after the item has been saved.
            item.save()
            item.attributes = attributes
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except Exception:
            # Best effort: log the failure and move on to the next record,
            # but let KeyboardInterrupt / SystemExit propagate.
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
        def update(self, searchTerm, searchOffset):
            """Run a YouTube search and store each located result as a NewsItem.

            :param searchTerm: query string passed to ``YouTubeAPI.runQuery``;
                also saved on every item as the 'searchTerm' attribute.
            :param searchOffset: result offset passed to ``YouTubeAPI.runQuery``.

            For each returned entry the item is matched on (title, schema):
            an existing item is updated, otherwise a new one is created.
            Entries are skipped (with a log line) when they have no non-empty
            title, an empty description, no published date, no position, a
            position outside the metro bounding box in both coordinate
            orders, or no location name and reverse geocoding fails.
            Any other error on an entry is logged and the loop continues.
            """
            def namespaced_values(entry, name):
                # Entry keys look like 'ns<N>:<name>'; collect the values
                # stored under prefixes ns0..ns8, in prefix order.
                candidates = ['ns%d:%s' % (n, name) for n in range(9)]
                return [entry[key] for key in candidates if key in entry]

            youtubeAPI = YouTubeAPI()
            numentries = 50  # How many results do we want the API to return
            logger.info("Starting YouTube_Scraper")
            response = youtubeAPI.runQuery(searchTerm, numentries, searchOffset)
            seencount = addcount = updatecount = 0
            if response:
                for entry in response:
                    seencount += 1

                    # When several title keys are present the last non-empty
                    # one is used, matching what the previous scan ended up
                    # with.
                    titles = [t for t in namespaced_values(entry, 'title')
                              if t != '']
                    if not titles:
                        logger.info("Skipping, as title is empty.")
                        continue
                    title = titles[-1]

                    try:
                        newsItem = NewsItem.objects.get(title=title,schema__id=self.schema.id)
                        status = "updated"
                    except NewsItem.DoesNotExist:
                        newsItem = NewsItem()
                        status = "added"
                    except NewsItem.MultipleObjectsReturned:
                        logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                        continue

                    try:
                        newsItem.schema = self.schema

                        # Only the first description key found is considered;
                        # an entry without any description key keeps whatever
                        # description the item already has.
                        descriptions = namespaced_values(entry, 'description')
                        if descriptions:
                            if descriptions[0] == '':
                                logger.info("Skipping %r as description is empty." % (title))
                                continue
                            newsItem.description = descriptions[0]

                        newsItem.url = entry['ns0:link']
                        newsItem.title = title

                        published = namespaced_values(entry, 'published')
                        if not published:
                            logger.info("Skipping %r as it has no published date." % title)
                            continue
                        # Keep only the date part in front of the 'T'
                        # separator; the time of day is dropped.
                        newsItem.pub_date = datetime.datetime.now()
                        newsItem.item_date = published[0].split('T')[0].encode( "utf-8" )
                        _short_title = newsItem.title[:30] + '...'

                        positions = namespaced_values(entry, 'pos')
                        if not positions:
                            logger.info("Skipping %r as it has no position." % _short_title)
                            continue
                        # The position is two whitespace-separated numbers;
                        # the first is used as Y (latitude) and the second as
                        # X (longitude).
                        pos_parts = positions[0].split()
                        lat, lon = float(pos_parts[0]), float(pos_parts[1])
                        newsItem.location = Point(lon, lat)
                        if not intersects_metro_bbox(newsItem.location):
                            # Some feeds swap the two numbers; try the
                            # opposite order before giving up.
                            reversed_loc = Point(lat, lon)
                            if intersects_metro_bbox(reversed_loc):
                                logger.info(
                                    "Got points in apparently reverse order, flipping them")
                                newsItem.location = reversed_loc
                            else:
                                logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, lat, lon))
                                continue

                        if not newsItem.location_name:
                            # Fall back to reverse-geocoding.
                            from ebpub.geocoder import reverse
                            try:
                                block, distance = reverse.reverse_geocode(newsItem.location)
                                logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                                newsItem.location_name = block.pretty_name
                                newsItem.block = block
                            except reverse.ReverseGeocodeError:
                                logger.info(" Skip, failed to reverse geocode %s for %r" % (newsItem.location.wkt, _short_title))
                                continue

                        attributes_ = {}
                        attributes_['photo_href'] = entry['ns0:thumb']
                        attributes_['videoID'] = entry['ns0:video_id']
                        attributes_['searchTerm'] = searchTerm

                        # Save first, then attach the attributes and save
                        # again.
                        newsItem.save()
                        newsItem.attributes = attributes_
                        newsItem.save()

                        if status == 'added':
                            addcount += 1
                        else:
                            updatecount += 1
                        logger.info("%s: %s" % (status, newsItem.title))
                    except Exception as e:
                        logger.exception("unexpected error: %s" % e)
            logger.info("YouTube_Scraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Exemple #6
0
    def update(self):
        """Fetch the feed at ``self.url`` and add or update a NewsItem per entry.

        Existing items are matched on (schema, title, description). Each
        entry's location comes from its GeoRSS point when present, otherwise
        from geocoding addresses found in its location name or in its
        title + description. Entries are skipped when no location can be
        determined, when the point is outside the metro bounding box in both
        coordinate orders, or when there is no location name and reverse
        geocoding fails. Any other error on an entry is logged and the loop
        moves on.

        Returns 1 if the schema named by ``self.schema_slug`` does not exist,
        otherwise None (including when the cached feed is unchanged).
        """
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error( "Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)
            # Built before the try block so the error handler at the bottom
            # always has this entry's title to report, even when the failure
            # happens on the first statements inside the try.
            _short_title = title[:30] + '...'

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." % text[:50])
                    addrs = parse_addresses(text)
                    # Use the first address that geocodes successfully.
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            item.block = result['block']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except Exception:
                            # Unexpected geocoder failure: log it and try the
                            # next candidate address.
                            logger.error('uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                                     % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    # Try the coordinates in the opposite order before
                    # rejecting the entry.
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except Exception:
                # Best effort per entry; KeyboardInterrupt / SystemExit are
                # deliberately not swallowed.
                logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
Exemple #7
0
    def update(self):
        """Fetch the feed at ``self.url`` and add or update a NewsItem per entry.

        Existing items are matched on (schema, title, description). The
        location is taken from the entry's GeoRSS point when available;
        otherwise addresses parsed from the location name (or, failing that,
        from title + description) are geocoded and the first success wins.
        Entries are skipped when no location is found, when the point is
        outside the metro bounding box in both coordinate orders, or when
        there is no location name and reverse geocoding fails. Any other
        per-entry error is logged and processing continues.

        Returns 1 if the schema named by ``self.schema_slug`` does not exist,
        otherwise None (including when the cached feed is unchanged).
        """
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)
            # Defined ahead of the try block so the final error handler can
            # always report this entry's own title, even if the failure
            # occurs in the first statements inside the try.
            _short_title = title[:30] + '...'

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    # Use the first address that geocodes successfully.
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except Exception:
                            # Unexpected geocoder failure: log it and try the
                            # next candidate address.
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    # Try the coordinates in the opposite order before
                    # rejecting the entry.
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except Exception:
                # Best effort per entry; KeyboardInterrupt / SystemExit are
                # deliberately not swallowed.
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
        loc = item.location.centroid
        print "Found %r outside bounds at %s, %s" % (item.title, loc.x, loc.y)
    else:
        loc = None
        print "NO location on %s: %s" % (item.schema.slug, item.title)
    fixed = False
    if item.location_name:
        from ebpub.geocoder import SmartGeocoder, AmbiguousResult
        try:
            result = SmartGeocoder().geocode(item.location_name)
        except AmbiguousResult, e:
            print "...%d choices, picking the first one" % len(e.choices)
            result = e.choices[0]
        except:
            result = None
        if result and intersects_metro_bbox(result['point']):
            print "Fixing %r by geocoding %r" % (item.title,
                                                 item.location_name)
            item.location = result['point']
            fixed = True

    if loc and not fixed:
        newloc = Point(loc.y, loc.x)
        if intersects_metro_bbox(newloc):
            print "Fixing %r by flipping bounds" % item.title
            item.location = newloc
            fixed = True

    if fixed:
        if not dry_run:
            print "saving %s" % item