def download_state_shapefile(state, zipcodes):
    print "Starting download"
    n = dict(CENSUS_STATES)[state].upper().replace(' ', '_')
    name         = "tl_2009_%s_zcta5" % state
    zip_filename = "%s.zip" % name
    cache_dir    = getattr(settings, 'HTTP_CACHE', tempfile.gettempdir())
    path         = os.path.join(cache_dir, zip_filename)
    url = "http://tigerline.census.gov/geo/tiger/TIGER2009/%(id)s_%(name)s/%(zip_filename)s" % { 'id': state, 'name': n, 'zip_filename': zip_filename }

    Retriever().cached_get_to_file(url, path)
    print "fetched %s" % url
    files = [name + ext for ext in ('.shp', '.dbf', '.prj', '.shp.xml', '.shx')]
    shapefile = os.path.join(cache_dir, '%s.shp' % name)
    # TODO: handle corrupt/incomplete/missing downloads (e.g. a truncated
    # zipfile, or expected files that aren't in the archive ...)
    try:
        ZipFile(path, 'r').extractall(cache_dir, files)
        print "extracted"
    except:
        log_exception()
        return
    for zipcode in zipcodes:
        print "importing %s" % zipcode
        import_zip_from_shapefile(shapefile, zipcode)
        print "... ok"
    print "All zip codes done"
def import_zip_from_shapefile(filename, zipcode):
    layer = layer_from_shapefile(filename, 0)
    importer = ZipImporter(layer, 'ZCTA5CE')
    try:
        importer.import_zip(zipcode)
    except:
        log_exception()
        return
    def save(self, old_record, list_record, detail_record):
        # This gets called once all parsing and cleanup is done.
        # It looks a lot like our 'expedient hack' code above.

        # We can ignore detail_record since has_detail is False.

        date = datetime.date(*list_record['updated_parsed'][:3])
        description = text_from_html(list_record['summary'])

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        # First we'll need some suitable text; throw away HTML tags.
        # full_description = list_record['content'][0]['value']
        # full_description = text_from_html(full_description)
        grabber = places.location_grabber()

        addrs = grabber(description)
        # print the article title for debugging
        # print list_record['title']

        if not addrs:
            addrs = grabber(list_record['title'])
            if not addrs:
                self.logger.info("no addresses found")
                return

        location = None
        location_name = u''
        block = None
        # Ready to geocode. If we had one location_name to try,
        # this could be done automatically in create_or_update(), but
        # we have multiple possible location_names.
        for l, r, name in addrs:
            # addr = addr.strip()
            try:
                locationSyn = LocationSynonym.objects.get(pretty_name=name)
                location = Location.objects.get(name=locationSyn.location).location
            except (LocationSynonym.DoesNotExist, Location.DoesNotExist):
                # The ORM lookups raise DoesNotExist, not GeocodingException,
                # so catch those here.
                log_exception(level=logging.DEBUG)
                continue
            location_name = name
            # block = location['block']
            # location = location['point']
            break
        if location is None:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      description=description,
                      title=list_record['title'],
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
def template_context_for_item(newsitem, widget=None):
    # try to make something ... reasonable for use in
    # templates.
    ctx = {
        'attributes': [],
        'attributes_by_name': {},
        '_item': newsitem,  # cached in case downstream code really needs it.
    }
    for att in newsitem.attributes_for_template():

        attr = {
            'name': att.sf.name,
            'title': att.sf.smart_pretty_name(),
            'display': att.sf.display
        }

        vals = [x['value'] for x in att.value_list()]
        if len(vals) == 1:
            attr['value'] = vals[0]
            attr['is_list'] = False
        else:
            attr['value'] = vals
            attr['is_list'] = True
        ctx['attributes'].append(attr)
        ctx['attributes_by_name'][att.sf.name] = attr

    # newsitem fields
    ctx['id'] = newsitem.id
    ctx['schema'] = newsitem.schema
    ctx['title'] = newsitem.title
    ctx['description'] = newsitem.description
    ctx['pub_date'] = newsitem.pub_date
    ctx['item_date'] = newsitem.item_date
    ctx['location'] = {}
    if newsitem.location:
        # TODO: Is centroid really what we want for non-Point geometries?
        ctx['location']['lon'] = newsitem.location.centroid.x
        ctx['location']['lat'] = newsitem.location.centroid.y
        ctx['location']['geom'] = newsitem.location

    ctx['location']['name'] = newsitem.location_name

    ctx['external_url'] = newsitem.url
    if newsitem.schema.has_newsitem_detail:
        ctx['internal_url'] = 'http://' + settings.EB_DOMAIN + newsitem.item_url()

    if widget is not None:
        if widget.item_link_template and widget.item_link_template.strip():
            try:
                ctx['internal_url'] = _eval_item_link_template(widget.item_link_template,
                                                               {'item': ctx, 'widget': widget})
            except:
                log_exception()
                # TODO: some sort of error handling
                return '#error'

    return ctx
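
A sketch of consuming the returned context, e.g. for debugging; newsitem is assumed to be a NewsItem instance, in the Python 2 idiom used throughout these examples:

# Hypothetical consumer: print each attribute as a "Title: value" line.
ctx = template_context_for_item(newsitem)
for attr in ctx['attributes']:
    if attr['is_list']:
        value = u', '.join(unicode(v) for v in attr['value'])
    else:
        value = attr['value']
    print u'%s: %s' % (attr['title'], value)
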
def unzip(filename, cwd=None):
    """Unzip filename, write extracted files into cwd (default is the current dir).
    """
    try:
        zfile = zipfile.ZipFile(filename)
        zfile.extractall(path=cwd)
        return True
    except:
        log_exception()
        return False
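
A usage sketch with illustrative paths, assuming a module-level logger as in the surrounding examples; the traceback itself is already recorded by log_exception:

# Hypothetical usage: extract an archive, degrading gracefully on failure.
if not unzip('/tmp/blocks.zip', cwd='/tmp/blocks'):
    logger.warn("extraction of /tmp/blocks.zip failed")
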
def makedirs(path):
    """Emulates the `mkdir -p` shell command.
    """
    if os.path.exists(path):
        return True
    try:
        os.makedirs(path)
        return True
    except:
        log_exception()
        return False
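
And a companion sketch for makedirs, again with an illustrative path:

# Hypothetical usage: ensure a cache directory exists before writing into it.
cache_dir = '/tmp/http_cache'
if makedirs(cache_dir):
    path = os.path.join(cache_dir, 'tl_2009_25_zcta5.zip')
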
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting ObituaryScraper")
        feed = feedparser.parse(self.url)
        total_created = 0
        for entry in feed.entries:
            title = convert_entities(entry.title)
            try:
                created = self.parse_entry(entry, title)
                if created:
                    total_created += 1
            except:
                logger.error("unexpected error: %s", sys.exc_info()[1])
                log_exception()
                break
        logger.info("Created %d of %d total" % (total_created, len(feed.entries)))
Example #12
def import_blocks_from_shapefiles(edges,
                                  featnames,
                                  faces,
                                  place,
                                  city=None,
                                  fix_cities=False,
                                  regenerate_intersections=True):
    # File args are paths to zip files.

    outdir = mkdtemp(suffix='-block-shapefiles')
    try:
        for path in (edges, featnames, faces, place):
            ZipFile(path, 'r').extractall(outdir)
    except:
        # TODO: display error in UI
        log_exception()
        shutil.rmtree(outdir)
        raise
    finally:
        os.unlink(edges)
        os.unlink(featnames)
        os.unlink(faces)
        os.unlink(place)
    try:
        edges = glob.glob(os.path.join(outdir, '*edges.shp'))[0]
        featnames = glob.glob(os.path.join(outdir, '*featnames.dbf'))[0]
        faces = glob.glob(os.path.join(outdir, '*faces.dbf'))[0]
        place = glob.glob(os.path.join(outdir, '*place.shp'))[0]
        tiger = TigerImporter(
            edges,
            featnames,
            faces,
            place,
            filter_city=city,
            fix_cities=fix_cities,
        )
        num_created = tiger.save()
    finally:
        shutil.rmtree(outdir)
    if regenerate_intersections:
        populate_streets_task()
    return num_created
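
A minimal invocation sketch; the zip paths and city are illustrative. Note the design choice that the input archives are always deleted (os.unlink in the finally clause), whether or not extraction succeeds:

# Hypothetical call with four TIGER/Line zip archives (consumed and deleted).
num_created = import_blocks_from_shapefiles('/tmp/edges.zip',
                                            '/tmp/featnames.zip',
                                            '/tmp/faces.zip',
                                            '/tmp/place.zip',
                                            city='BOSTON')
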
Example #14
            try:
                place.pretty_name = pretty_name
                place.address = address
                place.location = point
                place.url = place_url
                place.place_type = place_type
                place.save()

                if created:
                    message = 'Created new place %s' % (pretty_name)
                else:
                    message = 'Updated place %s' % (pretty_name)
                context['actions_taken'].append(message)
            except:
                log_exception()
                message = 'Error adding place "%s"' % pretty_name
                context['errors'].append(message)
                continue

            # now update Synonyms

            # destroy synonyms not in the new list, identify new synonyms
            new_synonyms = set(synonyms)
            for synonym in PlaceSynonym.objects.filter(place=place).all():
                if synonym.pretty_name not in new_synonyms:
                    synonym.delete()
                    message = 'Removing old synonym "%s" for "%s"' % (synonym.pretty_name, pretty_name)
                    context['actions_taken'].append(message)
                else:
Example #16
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error( "Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." % text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            item.block = result['block']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error('uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                                     % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
Example #17
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")
        
        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'),
                                                ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point((float(ns_get('geo:long')),
                                       float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except:
                logger.error("unexpected error:", sys.exc_info()[1])
                log_exception()
        logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Example #18
    def save(self, old_record, list_record, detail_record):
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record['updated_parsed'][:3])

        # Get the precinct from the tags.
        precincts = [
            'A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13', 'E18',
            'E5'
        ]
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return

        # Iterate over a separate variable so precinct stays None
        # when no tag matches a known precinct.
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record['summary']

        full_description = list_record['content'][0]['value']
        full_description = text_from_html(full_description)

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" %
                             (list_record['title'], list_record['link']))
            return

        location = None
        location_name = u''
        block = None

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except (GeocodingException, ParsingError):
                log_exception(level=logging.DEBUG)
                continue
            location_name = location['address']
            location = location['point']
            break
        else:
            self.logger.info("no addresses geocoded in %r" %
                             list_record['title'])
            return

        kwargs = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=list_record['title'],
            description=description,
            url=list_record['link'],
        )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
Example #19
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
Example #21
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")
        
        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'),
                                                ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point((float(ns_get('geo:long')),
                                       float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except:
                logger.error("unexpected error:", sys.exc_info()[1])
                log_exception()
        logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Example #22
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)

    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title)
        try:
            item = NewsItem.objects.get(title=title, schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn(
                "Multiple entries matched title %r, event titles are not unique?"
                % title)
            continue
        try:
            item.location_name = entry.get(
                'xcal_x-calconnect-street') or entry.get(
                    'x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point(
                (float(entry['geo_long']), float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue

            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r" %
                                block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''

            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("unexpected error:", sys.exc_info()[1])
            log_exception()
    logger.info("add_events finished: %d added, %d updated" %
                (addcount, updatecount))
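
These updater functions read like standalone management scripts; a minimal entry point, assuming they are run directly rather than through a scheduler:

# Hypothetical script entry point.
if __name__ == '__main__':
    update()
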
def search(request, schema_slug=''):
    "Performs a location search and redirects to the address/xy page."
    # Check whether a schema was provided.
    if schema_slug:
        try:
            schema = get_schema_manager(request).get(slug=schema_slug)
        except Schema.DoesNotExist:
            raise Http404('Schema does not exist')
        url_prefix = schema.url()[:-1]
    else:
        schema = None
        url_prefix = ''

    # Get the query.
    q = request.GET.get('q', '').strip()
    if not q:
        return HttpResponseRedirect(url_prefix + '/') # TODO: Do something better than redirecting.

    # For /search/?type=alert, we redirect results to the alert page, not the
    # place page.
    if request.GET.get('type', '') == 'alert':
        url_method = 'alert_url'
    else:
        url_method = 'url'

    # Try to geocode it using full_geocode().
    try:
        result = full_geocode(q, search_places=False)
    except:
        logger.debug('Unhandled exception from full_geocode:')
        log_exception(level=logging.DEBUG, logger=logger)
    else:
        if result['ambiguous']:
            if result['type'] == 'block':
                streets = []
                street_blocks = {}
                for block in result['result']:
                    street_name = block.street_pretty_name
                    if street_name not in streets:
                        streets.append(street_name)
                        street_blocks[street_name] = []
                    street_blocks[street_name].append(block)

                choices = [{'name': s, 'blocks': street_blocks[s]} for s in streets]
                return eb_render(request, 'db/search_invalid_block.html', {
                    'query': q,
                    'choices': choices,
                })
            else:
                return eb_render(request, 'db/did_you_mean.html', {'query': q, 'choices': result['result']})
        elif result['type'] == 'location':
            return HttpResponseRedirect(url_prefix + getattr(result['result'], url_method)())
        elif result['type'] == 'address':
            # Block
            if result['result']['block']:
                return HttpResponseRedirect(url_prefix + getattr(result['result']['block'], url_method)())
            # Intersection
            try:
                intersection = Intersection.objects.get(id=result['result']['intersection_id'])
            except Intersection.DoesNotExist:
                pass
            else:
                return HttpResponseRedirect(url_prefix + getattr(intersection, url_method)())

    # Failing the geocoding, look in the special-case table.
    try:
        special_case = SearchSpecialCase.objects.get(query=normalize(q))
    except SearchSpecialCase.DoesNotExist:
        pass
    else:
        if special_case.redirect_to:
            return HttpResponseRedirect(special_case.redirect_to)
        else:
            return eb_render(request, 'db/search_special_case.html', {'query': q, 'special_case': special_case})

    # Failing that, display a list of ZIP codes if this looks like a ZIP.
    if re.search(r'^\s*\d{5}(?:-\d{4})?\s*$', q):
        z_list = Location.objects.filter(location_type__slug='zipcodes', is_public=True).select_related().order_by('name')
        if z_list:
            return eb_render(request, 'db/search_error_zip_list.html', {'query': q, 'zipcode_list': z_list})

    # Failing all of that, display the search error page.
    lt_list = LocationType.objects.filter(is_significant=True).order_by('name')
    return eb_render(request, 'db/search_error.html', {'query': q, 'locationtype_list': lt_list})
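
A sketch of wiring this view into a Django 1.x-era urlconf; the patterns are illustrative, not the project's actual routing:

# Hypothetical urlconf (old-style Django, matching the Python 2 idiom here).
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    url(r'^search/$', search),                          # site-wide search
    url(r'^(?P<schema_slug>[-\w]+)/search/$', search),  # per-schema search
)
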
Example #24
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = "http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai"
    schema_slug = "local-news"

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith("http"):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id, title=title, description=description)
            # url=item_url)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get("x-calconnect-street") or entry.get("georss_featurename")
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get("georss_point") or entry.get("point")
            x, y = None, None
            if point:
                x, y = point.split(" ")
            if True:
                # Fall back on geocoding.
                text = item.title + " " + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result["point"]
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except GeocodingException:
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error("uncaught geocoder exception on %r\n" % addr)
                        log_exception()
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse

                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u""
            item.save()
            if status == "added":
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
Example #27
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' %
                                     addr)
                        log_exception()
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))