Example #1
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data,
    by falling back to external geocoders.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        addrs = parse_addresses(addr)
    else:
        addrs = [(addr, '')]  # wrap as (address, city) so the loop below can unpack it
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                logger.debug("internally geocoded %r" % addr)
                return point.x, point.y
            except GeocodingException:
                logger.debug("internal geocoder failed on %r:\n" % addr)
                log_exception(level=logging.DEBUG)
                x, y = None, None
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data.  But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
        except:
            logger.error('uncaught geocoder exception on %r\n' % addr)
            log_exception()

    return None, None
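
A quick note on how this helper gets used: the feed scraper shown in a later example calls it with one extracted address string at a time and treats a (None, None) result as "could not geocode". Below is a minimal, hedged sketch of that call pattern; the sample text is invented, and running it requires the ebdata/ebpub stack plus the module-level logger, log_exception and GeocodingException the function relies on.

x, y = quick_dirty_fallback_geocode('Fire reported at 123 Main St. yesterday.')
if (x, y) != (None, None):
    # x is the longitude and y the latitude (point.x / point.y above).
    print "geocoded to x=%s, y=%s" % (x, y)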
Example #2
def get_address(text=""):
    _addresses = addresses.parse_addresses(text)
    a_dict = {'address': None, 'latitude': None, 'longitude': None}
    for address in _addresses:
        if a_dict['address']: # only use the first match for now
            continue
        a_dict = geocode(address[0], city=settings.CITY, state=settings.STATE)

    return a_dict
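
For context, a hedged sketch of calling this helper; the input text is invented, geocode/addresses/settings are whatever the surrounding module imports, and the dictionary keys come from the a_dict default above rather than from any geocoder documentation.

info = get_address('Police were called to 123 Main St. on Tuesday.')
print info['address'], info['latitude'], info['longitude']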
Example #3
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data,
    by falling back to external geocoders.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        addrs = parse_addresses(addr)
    else:
        addrs = [(addr, '')]  # wrap as (address, city) so the loop below can unpack it
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                print "YAY internally geocoded %r" % addr
                return point.x, point.y
            except:
                x,y = None, None
                sys.stderr.write("BOO internal geocoder failed on %r:\n" % addr)
                log_exception()
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data.  But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
            if None in (x, y):
                # XXX log something
                # Other geocoders need to know the city
                addr += ', Boston, MA'
                from geopy import geocoders
                g = geocoders.Google(resource='maps', output_format='json')
                import urllib2
                try:
                    for unused, (lat, lon) in g.geocode(addr, exactly_one=False):
                        print "YAY google geocoded %r" % addr
                        return (lon, lat)
                except urllib2.HTTPError:
                    # Rate throttled? Try another.
                    pass
                except ValueError:
                    # Bad JSON response? why?
                    pass
                us = geocoders.GeocoderDotUS()
                for unused, (lat, lon) in us.geocode(addr, exactly_one=False):
                    print "YAY geocoder.us geocoded %r" % addr
                    return (lon, lat)
        except:
            sys.stderr.write( '===== uncaught geocoder exception on %r\n' % addr)
            log_exception()
            sys.stderr.write('======================\n')

    return None, None
Example #4
    def geocode_if_needed(self,
                          point,
                          location_name,
                          address_text='',
                          **kwargs):
        """
        If either ``point`` or ``location_name`` is not set, try to
        geocode / reverse-geocode as needed to derive one from the
        other.  Returns (point, location_name).

        If neither one is set, try to parse addresses out of
        ``address_text`` and derive both.

        Either value may be None if it can't be determined.

        Any other keyword args are passed to ``full_geocode()``.
        """
        if not point:
            text = convert_entities(location_name or address_text)
            self.logger.debug("...Falling back on geocoding from '%s...'" %
                              text[:50])
            addrs = parse_addresses(text)
            for addr, unused in addrs:
                try:
                    result = self.geocode(addr, **kwargs)
                    if result is not None:
                        point = result['point']
                        self.logger.debug("internally geocoded %r" % addr)
                        # TODO: what if it's a Place?
                        if not location_name:
                            location_name = result['address']
                        break
                except:
                    self.logger.exception(
                        'uncaught geocoder exception on %r\n' % addr)
                    continue

        if point and not location_name:
            # Fall back to reverse-geocoding.
            from ebpub.geocoder import reverse
            try:
                block, distance = reverse.reverse_geocode(point)
                self.logger.debug(" Reverse-geocoded point to %r" %
                                  block.pretty_name)
                location_name = block.pretty_name
            except reverse.ReverseGeocodeError:
                location_name = None

        return (point, location_name)
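
A hedged sketch of the call pattern the docstring describes; the call site is invented, and scraper stands in for an instance of whatever class defines this method.

point, location_name = scraper.geocode_if_needed(
    None, None, address_text='Crews responded to a fire at 123 Main St.')
# Either value may still be None if geocoding and reverse-geocoding both fail.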
Example #5
    def assertParses(self, text, expected):
        self.assertEqual(parse_addresses(text), expected)
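
This tiny test helper compares the raw return value of parse_addresses against an expected list. The other examples pin down the shape of that value by unpacking it: a list of (address, city) pairs, where the city part may be empty and is often discarded as "unused". A hedged sketch of consuming it directly; the input text is invented and the output is not claimed to match the real parser.

for addr, city in parse_addresses('There was a crash at 123 Main St. in Boston.'):
    print addr, city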
Example #6
def auto_locations(paragraph_list, default_city=''):
    """
    Given a list of strings, detects all valid, unique addresses and returns a
    tuple (result, report), where result is a list of tuples in the format
    (address, wkt, excerpt, block) and report is a string of what happened.

    If default_city is given, it will be used in the geocoding for detected
    addresses that don't specify a city.
    """
    result, report = [], []
    addresses_seen = set()
    geocoder = SmartGeocoder()
    for para in paragraph_list:
        for addy, city in parse_addresses(para):
            # Skip addresses if they have a city that's a known suburb.
            if city and Suburb.objects.filter(
                    normalized_name=normalize(city)).count():
                report.append('got suburb "%s, %s"' % (addy, city))
                continue

            # Try geocoding the address. If a city was provided, first try
            # geocoding with the city, then fall back to just the address
            # (without the city).
            point = None
            attempts = [addy]
            if default_city:
                attempts.insert(0, '%s, %s' % (addy, default_city))
            if city and city.lower() != default_city.lower():
                attempts.insert(0, '%s, %s' % (addy, city))
            for attempt in attempts:
                try:
                    point = geocoder.geocode(attempt)
                    break
                except AmbiguousResult:
                    report.append('got ambiguous address "%s"' % attempt)
                    # Don't try any other address attempts, because they only
                    # get *more* ambiguous. Plus, the subsequent attempts could
                    # be incorrect. For example, with this:
                    #    addy = '100 Broadway'
                    #    city = 'Manhattan'
                    #    default_city = 'Brooklyn'
                    # There are multiple "100 Broadway" addresses in Manhattan,
                    # so geocoding should fail at this point. It should not
                    # roll back to try the default_city (Brooklyn).
                    break
                except (DoesNotExist, InvalidBlockButValidStreet):
                    report.append('got nonexistent address "%s"' % attempt)
                except ParsingError:
                    report.append('got parsing error "%s"' % attempt)
            if point is None:
                continue  # This address could not be geocoded.

            if point['address'] in addresses_seen:
                continue
            if len(para) > 300:
                try:
                    excerpt = smart_excerpt(para, addy)
                except ValueError:
                    excerpt = para
            else:
                excerpt = para
            result.append((addy, point['point'], excerpt, point['block']))
            addresses_seen.add(point['address'])
    return (result, '; '.join(report))
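
A hedged sketch of calling auto_locations; the paragraph text is invented, and each entry of the result unpacks as (address, point, excerpt, block) per the append call above.

hits, report = auto_locations(
    ['Crews responded to 123 Main St. around noon.'], default_city='Boston')
for address, point, excerpt, block in hits:
    print address, excerpt
print report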
Example #7
    def unique_fields(self, list_record):
        # not necessarily primary key, but for this script's purposes
        # these are the fields that in combination uniquely identify
        # an article.
        date = datetime.date(*list_record['updated_parsed'][:3])
        precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5']
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we can set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        if 'Boston 24' in tags:
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # These are citywide though, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")

        description = list_record['content'][0]['value']
        # TODO: we should have a stock 'clean up html' function.
        description = preprocess_to_string(
            description,
            drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'p', 'strong', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'),
            drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'),
            drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target'))
        from ebdata.retrieval.utils import convert_entities
        description = convert_entities(description)
        #description = description.replace(' ', ' ').replace(' ', ' ')

        addrs = parse_addresses(description)
        if not addrs:
            self.logger.info("no addresses found in %r" % list_record['title'])

        location = None
        location_name = u''
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(addr)
                if (x, y) != (None, None):
                    location = Point((float(x), float(y)))
                    location_name = addr.title()
            except:
                print "ugh, %r" % addr
                # XXX log something

        return dict(item_date=date,
                    location=location,
                    location_name=location_name,
                    title=list_record['title'],
                    description=description,
                    )
Example #8
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error( "Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." % text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            item.block = result['block']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error('uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                                     % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
Example #9
    def save(self, old_record, list_record, detail_record):
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record["updated_parsed"][:3])

        # Get the precinct from the tags.
        precincts = ["A1", "A7", "B2", "B3", "C11", "C6", "D14", "D4", "E13", "E18", "E5"]
        precinct = None
        tags = [t["term"] for t in list_record["tags"]]
        if not tags:
            return

        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record["summary"]

        full_description = list_record["content"][0]["value"]
        full_description = text_from_html(full_description)

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" % (list_record["title"], list_record["link"]))
            return

        location = None
        location_name = u""
        block = None

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except GeocodingException:
                log_exception(level=logging.DEBUG)
                continue
            location_name = location["address"]
            block = location["block"]
            location = location["point"]
            break
        else:
            self.logger.info("no addresses geocoded in %r" % list_record["title"])
            return

        kwargs = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=list_record["title"],
            description=description,
            url=list_record["link"],
        )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
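
The address loop above relies on Python's for/else: the else branch runs only when the loop finishes without hitting break, which here means none of the extracted addresses could be geocoded. The idiom in isolation, with all names hypothetical:

for candidate in candidates:
    if geocodes_ok(candidate):
        break             # success: the else clause is skipped
else:
    report_failure()      # every candidate was tried without a break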
Example #10
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
Example #11
    def save(self, old_record, list_record, detail_record):
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record['updated_parsed'][:3])

        # Get the precinct from the tags.
        precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5']
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return

        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record['summary']

        full_description = list_record['content'][0]['value']
        full_description = text_from_html(full_description)

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" % (list_record['title'], 
                                                           list_record['link']))
            return

        location = None
        location_name = u''
        block = None

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except (GeocodingException, ParsingError):
                log_exception(level=logging.DEBUG)
                continue
            location_name = location['address']
            location = location['point']
            break
        else:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      title=list_record['title'],
                      description=description,
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
Example #12
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text,
                                     exc_info=True)
                        continue
                    except:
                        logger.exception('uncaught geocoder exception on %r\n' % addr)
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("Warning: couldn't save %r. Traceback:" % item.title)

    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
Example #13
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' %
                                     addr)
                        log_exception()
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))