def geocode(schema=None):
    """
    Geocode NewsItems with null locations.

    If ``schema`` is provided, only geocode NewsItems with that particular
    schema slug.
    """
    geocoder = SmartGeocoder()
    qs = NewsItem.objects.filter(location__isnull=True).order_by('-id')
    if schema is not None:
        print "Geocoding %s..." % schema
        qs = qs.filter(schema__slug=schema)
    else:
        print "Geocoding all ungeocoded newsitems..."

    geocoded_count = 0
    not_found_count = 0
    ambiguous_count = 0
    parsing_error_count = 0
    invalid_block_count = 0

    for ni in qs.iterator():
        loc_name = ni.location_name
        try:
            add = geocoder.geocode(loc_name)
        except InvalidBlockButValidStreet:
            print '      invalid block but valid street: %s' % loc_name
            invalid_block_count += 1
        except AmbiguousResult, e:
            print '      ambiguous: %s' % loc_name
            ambiguous_count += 1
        except GeocodingException, e:
            print '      not found: %s' % loc_name
            not_found_count += 1
Example #2
class TestSmartGeocoder(django.test.TestCase):
    fixtures = ["wabash.yaml"]

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch("ebpub.streets.models.get_metro")
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        address = self.geocoder.geocode("200 S Wabash")
        self.assertEqual(address["city"], "Chicago")

    @mock.patch("ebpub.streets.models.get_metro")
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, "220 Wabash")

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode, "100000 S Wabash")

    @mock.patch("ebpub.streets.models.get_metro")
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {"city_name": "CHICAGO", "multiple_cities": False}
        address = self.geocoder.geocode("200 block of Wabash")
        self.assertEqual(address["city"], "Chicago")

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode("Wabash and Jackson")
        self.assertEqual(address["city"], "CHICAGO")
Example #3
 def clean(self):
     loc_info = self.cleaned_data.get('location')
     if isinstance(loc_info, list):
         # olwidget wraps geometries up as lists in case there's several per map
         assert len(loc_info) == 1
         loc_info = loc_info[0]
     if not loc_info:
         address = self.cleaned_data.get('address')
         if not address:
             self._append_error(
                 'location',
                 u'Either an address or a location must be specified.')
         else:
             # try to geocode the address...
             try:
                 geocoder = SmartGeocoder()
                 addr = geocoder.geocode(address)
                 loc_info = addr['point']
             except AmbiguousResult:
                 self._append_error(
                     'location',
                     u'Address is ambiguous, please specify a point directly.'
                 )
             except GeocodingException:
                 self._append_error(
                     'location',
                     u'Unable to geocode address, please correct the address or specify a point directly.'
                 )
         # Again, olwidget expects these to be lists...
         loc_info = [loc_info]
         self.cleaned_data['location'] = loc_info
     return super(PlaceAdminForm, self).clean()
Example #4
class BaseGeocoderTestCase(django.test.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '220 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode, '100000 S Wabash')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
Example #5
 def __init__(self, use_cache=True):
     if not use_cache:
         self.retriever = Retriever(cache=None, sleep=self.sleep, timeout=self.timeout)
     else:
         self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
     self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
     self.start_time = datetime.datetime.now()
     self._geocoder = SmartGeocoder()
Example #6
 def __init__(self, *args, **kwargs):
     if self.logname is None:
         self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
     super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
     self._schema_cache = None
     self._schemas_cache = None
     self._lookups_cache = None
     self._schema_fields_cache = None
     self._schema_field_mapping_cache = None
     self._geocoder = SmartGeocoder()
Example #7
def full_geocode(query, search_places=True):
    """
    Tries the full geocoding stack on the given query (a string):
        * Normalizes whitespace/capitalization
        * Searches the Misspelling table to correct location misspellings
        * Searches the Location table
        * Failing that, searches the Place table (if search_places is True)
        * Failing that, uses SmartGeocoder to parse this as an address
        * Failing that, raises whichever error is raised by the geocoder --
          except AmbiguousResult, in which case all possible results are
          returned

    Returns a dictionary of {type, result, ambiguous}, where ambiguous is True
    or False and type can be:
        * 'location' -- in which case result is a Location object.
        * 'place' -- in which case result is a Place object. (This is only
          possible if search_places is True.)
        * 'address' -- in which case result is an Address object as returned
          by geocoder.geocode().
        * 'block' -- in which case result is a list of Block objects.

    If ambiguous is True, result will be a list of objects.
    """
    query = normalize(query)

    # First, try correcting the spelling ("LAKEVIEW" -> "LAKE VIEW").
    try:
        miss = Misspelling.objects.get(incorrect=query)
    except Misspelling.DoesNotExist:
        pass
    else:
        query = miss.correct

    # Search the Location table.
    try:
        loc = Location.objects.get(normalized_name=query)
    except Location.DoesNotExist:
        pass
    else:
        return {'type': 'location', 'result': loc, 'ambiguous': False}

    # Search the Place table, for stuff like "Sears Tower".
    if search_places:
        places = Place.objects.filter(normalized_name=query)
        if len(places) == 1:
            return {'type': 'place', 'result': places[0], 'ambiguous': False}
        elif len(places) > 1:
            return {'type': 'place', 'result': places, 'ambiguous': True}

    # Try geocoding this as an address.
    geocoder = SmartGeocoder()
    try:
        result = geocoder.geocode(query)
    except AmbiguousResult, e:
        return {'type': 'address', 'result': e.choices, 'ambiguous': True}
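A brief usage sketch for full_geocode() above (not part of the original snippet). It assumes the function is importable from ebpub.geocoder, that the Location/Place/Block tables are populated, and that Location.name and Place.pretty_name are the relevant display attributes:

hit = full_geocode('sears tower')   # any free-form query string
if hit['ambiguous']:
    # 'result' is a list of candidates; pick one or ask the user.
    print 'Got %d candidates of type %s' % (len(hit['result']), hit['type'])
elif hit['type'] == 'location':
    print 'Matched Location: %s' % hit['result'].name
elif hit['type'] == 'place':
    print 'Matched Place: %s' % hit['result'].pretty_name
elif hit['type'] == 'address':
    print 'Matched an address with point %s' % hit['result']['point']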
Example #8
def full_geocode(query, search_places=True):
    """
    Tries the full geocoding stack on the given query (a string):
        * Normalizes whitespace/capitalization
        * Searches the Misspelling table to correct location misspellings
        * Searches the Location table
        * Failing that, searches the Place table (if search_places is True)
        * Failing that, uses SmartGeocoder to parse this as an address
        * Failing that, raises whichever error is raised by the geocoder --
          except AmbiguousResult, in which case all possible results are
          returned

    Returns a dictionary of {type, result, ambiguous}, where ambiguous is True
    or False and type can be:
        * 'location' -- in which case result is a Location object.
        * 'place' -- in which case result is a Place object. (This is only
          possible if search_places is True.)
        * 'address' -- in which case result is an Address object as returned
          by geocoder.geocode().
        * 'block' -- in which case result is a list of Block objects.

    If ambiguous is True, result will be a list of objects.
    """
    query = normalize(query)

    # First, try correcting the spelling ("LAKEVIEW" -> "LAKE VIEW").
    try:
        miss = Misspelling.objects.get(incorrect=query)
    except Misspelling.DoesNotExist:
        pass
    else:
        query = miss.correct

    # Search the Location table.
    try:
        loc = Location.objects.get(normalized_name=query)
    except Location.DoesNotExist:
        pass
    else:
        return {'type': 'location', 'result': loc, 'ambiguous': False}

    # Search the Place table, for stuff like "Sears Tower".
    if search_places:
        places = Place.objects.filter(normalized_name=query)
        if len(places) == 1:
            return {'type': 'place', 'result': places[0], 'ambiguous': False}
        elif len(places) > 1:
            return {'type': 'place', 'result': places, 'ambiguous': True}

    # Try geocoding this as an address.
    geocoder = SmartGeocoder()
    try:
        result = geocoder.geocode(query)
    except AmbiguousResult, e:
        return {'type': 'address', 'result': e.choices, 'ambiguous': True}
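A small data-setup sketch for the spelling-correction pass above. The strings mirror the comment in the code; it assumes normalize() upper-cases the query and that a Location with normalized_name 'LAKE VIEW' already exists:

Misspelling.objects.create(incorrect='LAKEVIEW', correct='LAKE VIEW')
hit = full_geocode('Lakeview')   # corrected to 'LAKE VIEW' before the Location lookup
assert hit['type'] == 'location' and hit['ambiguous'] is False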
Example #9
def geocode(schema=None):
    """
    Geocode NewsItems with null locations.

    If ``schema`` is provided, only geocode NewsItems with that particular
    schema slug.
    """
    geocoder = SmartGeocoder()
    qs = NewsItem.objects.filter(location__isnull=True).order_by('-id')
    if schema is not None:
        print "Geocoding %s..." % schema
        qs = qs.filter(schema__slug=schema)
    else:
        print "Geocoding all ungeocoded newsitems..."

    geocoded_count = 0
    not_found_count = 0
    ambiguous_count = 0
    parsing_error_count = 0
    invalid_block_count = 0

    for ni in qs.iterator():
        loc_name = ni.location_name
        try:
            add = geocoder.geocode(loc_name)
        except InvalidBlockButValidStreet:
            print '      invalid block but valid street: %s' % loc_name
            invalid_block_count += 1
        except AmbiguousResult:
            print '      ambiguous: %s' % loc_name
            ambiguous_count += 1
        except GeocodingException:
            print '      not found: %s' % loc_name
            not_found_count += 1
        except ParsingError:
            print '      parse error: %s' % loc_name
            parsing_error_count += 1
        except:
            raise
        else:
            ni.location = add['point']
            ni.block = add['block']
            ni.save()
            print '%s (%s)' % (loc_name, ni.item_url())
            geocoded_count += 1
    else:
        print "No NewsItems with null locations found"

    print "------------------------------------------------------------------"
    print "Geocoded:       %s" % geocoded_count
    print "Not found:      %s" % not_found_count
    print "Ambiguous:      %s" % ambiguous_count
    print "Parse errors:   %s" % parsing_error_count
    print "Invalid blocks: %s" % invalid_block_count
Example #10
def geocode(schema=None):
    """
    Geocode NewsItems with null locations.

    If ``schema`` is provided, only geocode NewsItems with that particular
    schema slug.
    """
    geocoder = SmartGeocoder()
    qs = NewsItem.objects.filter(location__isnull=True).order_by('-id')
    if schema is not None:
        print "Geocoding %s..." % schema
        qs = qs.filter(schema__slug=schema)
    else:
        print "Geocoding all ungeocoded newsitems..."

    geocoded_count = 0
    not_found_count = 0
    ambiguous_count = 0
    parsing_error_count = 0
    invalid_block_count = 0

    for ni in qs.iterator():
        loc_name = ni.location_name
        try:
            add = geocoder.geocode(loc_name)
        except InvalidBlockButValidStreet:
            print '      invalid block but valid street: %s' % loc_name
            invalid_block_count += 1
        except AmbiguousResult:
            print '      ambiguous: %s' % loc_name
            ambiguous_count += 1
        except GeocodingException:
            print '      not found: %s' % loc_name
            not_found_count += 1
        except ParsingError:
            print '      parse error: %s' % loc_name
            parsing_error_count += 1
        except:
            raise
        else:
            ni.location = add['point']
            ni.block = add['block']
            ni.save()
            print '%s (%s)' % (loc_name, ni.item_url())
            geocoded_count += 1
    else:
        print "No NewsItems with null locations found"

    print "------------------------------------------------------------------"
    print "Geocoded:       %s" % geocoded_count
    print "Not found:      %s" % not_found_count
    print "Ambiguous:      %s" % ambiguous_count
    print "Parse errors:   %s" % parsing_error_count
    print "Invalid blocks: %s" % invalid_block_count
Example #11
def full_geocode(query, search_places=True):
    """
    Tries the full geocoding stack on the given query (a string):
        * Normalizes whitespace/capitalization
        * Searches the Misspelling table to correct location misspellings
        * Searches the Location table
        * Failing that, searches the Place table (if search_places is True)
        * Failing that, uses SmartGeocoder to parse this as an address
        * Failing that, raises whichever error is raised by the geocoder --
          except AmbiguousResult, in which case all possible results are
          returned

    Returns a dictionary of {type, result, ambiguous}, where ambiguous is True
    or False and type can be:
        * 'location' -- in which case result is a Location object.
        * 'place' -- in which case result is a Place object. (This is only
          possible if search_places is True.)
        * 'address' -- in which case result is an Address object as returned
          by geocoder.geocode().
        * 'block' -- in which case result is a list of Block objects.

    If ambiguous is True, result will be a list of objects.
    """
    # Search the Location table.
    try:
        canonical_loc = LocationSynonym.objects.get_canonical(query)
        loc = Location.objects.get(normalized_name=canonical_loc)
    except Location.DoesNotExist:
        pass
    else:
        logger.debug('geocoded %r to Location %s' % (query, loc))
        return {'type': 'location', 'result': loc, 'ambiguous': False}

    # Search the Place table, for stuff like "Sears Tower".
    if search_places:
        canonical_place = PlaceSynonym.objects.get_canonical(query)
        places = Place.objects.filter(normalized_name=canonical_place)
        if len(places) == 1:
            logger.debug(u'geocoded %r to Place %s' % (query, places[0]))

            return {'type': 'place', 'result': places[0], 'ambiguous': False}
        elif len(places) > 1:
            logger.debug(u'geocoded %r to multiple Places: %s' % (query, unicode(places)))
            return {'type': 'place', 'result': places, 'ambiguous': True}

    # Try geocoding this as an address.
    geocoder = SmartGeocoder(use_cache=getattr(settings, 'EBPUB_CACHE_GEOCODER', False))
    try:
        result = geocoder.geocode(query)
    except AmbiguousResult, e:
        logger.debug('Multiple addresses for %r' % query)
        return {'type': 'address', 'result': e.choices, 'ambiguous': True}
Example #12
def add_newsitem(seed_url, seed_name, url, article_headline, article_date,
                 name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().fetch_data(url)
        p = Page.objects.create(seed=s,
                                url=url,
                                scraped_url=url,
                                html=html,
                                when_crawled=datetime.datetime.now(),
                                is_article=True,
                                is_pdf=False,
                                is_printer_friendly=False,
                                article_headline=article_headline,
                                article_date=article_date,
                                has_addresses=None,
                                when_geocoded=None,
                                geocoded_by='',
                                times_skipped=0,
                                robot_report='')
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name)  # Let exceptions bubble up.
        data_tuples.append(
            (location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples,
                       article_headline, article_date)
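A hedged call sketch for add_newsitem() above; all URLs, the headline, the date and the excerpt text are invented. name_excerpts is a sequence of (location_name, excerpt) pairs, matching the loop in the snippet:

import datetime

add_newsitem(
    seed_url='http://example.com/news/',
    seed_name='Example Daily',
    url='http://example.com/news/2011/05/fire-on-wabash.html',
    article_headline='Fire reported on Wabash',
    article_date=datetime.date(2011, 5, 1),
    name_excerpts=[('200 S Wabash', 'Crews responded to 200 S Wabash on Sunday.')],
)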
Example #13
def add_newsitem(seed_url, seed_name, url, article_headline, article_date, name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().get_html(url)
        p = Page.objects.create(
            seed=s,
            url=url,
            scraped_url=url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report=''
        )
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name) # Let exceptions bubble up.
        data_tuples.append((location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples, article_headline, article_date)
Example #14
def quick_dirty_fallback_geocode(addr, parse=True):
    """
    Try to get SOME x,y even with bad blocks data,
    by falling back to external geocoders.
    """
    from ebdata.nlp.addresses import parse_addresses
    from ebpub.geocoder import SmartGeocoder
    if parse:
        addrs = parse_addresses(addr)
    else:
        addrs = [addr]
    for addr, unused in addrs:
        try:
            try:
                result = SmartGeocoder().geocode(addr)
                point = result['point']
                logger.debug("internally geocoded %r" % addr)
                return point.x, point.y
            except GeocodingException:
                logger.debug("internal geocoder failed on %r:\n" % addr)
                log_exception(level=logging.DEBUG)
                x, y = None, None
                # XXX Don't bother, external geocoding rarely gives us
                # anything inside Boston now that we have decent
                # blocks data.  But I want to preserve this script for
                # now till we figure out what to do with geocoding
                # more generally
                continue
        except:
            logger.error('uncaught geocoder exception on %r\n' % addr)
            log_exception()

    return None, None
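A short usage sketch (assumed, not from the original module): with parse=True the helper runs parse_addresses() over free text first, then geocodes each candidate until one succeeds, returning (x, y) or (None, None):

x, y = quick_dirty_fallback_geocode('Fire reported near 200 S Wabash Ave last night')
if x is not None:
    print 'geocoded to lon=%s, lat=%s' % (x, y)
else:
    print 'no address in that text could be geocoded'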
Example #15
class TestSmartGeocoder(django.test.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        result = self.geocoder.geocode('200 S Wabash Ave')
        self.assertEqual(result['city'], 'Chicago')
        self.assertEqual(result['address'], '200 S Wabash Ave.')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder__wrong_suffix_works(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        result = self.geocoder.geocode('220 S Wabash St')
        self.assertEqual(result['address'], '220 S Wabash Ave.')
        # Or none at all.
        result = self.geocoder.geocode('220 S Wabash')
        self.assertEqual(result['address'], '220 S Wabash Ave.')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        # Ambiguous because of missing pre_dir.
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '220 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet,
                          self.geocoder.geocode, '100000 S Wabash')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {'city_name': 'CHICAGO',
                                       'multiple_cities': False}
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
Example #16
class BaseGeocoderTestCase(django.test.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {
            'city_name': 'CHICAGO',
            'multiple_cities': False
        }
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_address_geocoder_ambiguous(self, mock_get_metro):
        mock_get_metro.return_value = {
            'city_name': 'CHICAGO',
            'multiple_cities': False
        }
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '220 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          '100000 S Wabash')

    @mock.patch('ebpub.streets.models.get_metro')
    def test_block_geocoder(self, mock_get_metro):
        mock_get_metro.return_value = {
            'city_name': 'CHICAGO',
            'multiple_cities': False
        }
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
Example #17
class BaseGeocoderTestCase(unittest.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    def test_address_geocoder(self):
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_address_geocoder_ambiguous(self):
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '200 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode, '100000 S Wabash')

    def test_block_geocoder(self):
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
Example #18
class BaseGeocoderTestCase(unittest.TestCase):
    fixtures = ['wabash.yaml']

    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)

    def test_address_geocoder(self):
        address = self.geocoder.geocode('200 S Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_address_geocoder_ambiguous(self):
        self.assertRaises(AmbiguousResult, self.geocoder.geocode, '200 Wabash')

    def test_address_geocoder_invalid_block(self):
        self.assertRaises(InvalidBlockButValidStreet, self.geocoder.geocode,
                          '100000 S Wabash')

    def test_block_geocoder(self):
        address = self.geocoder.geocode('200 block of Wabash')
        self.assertEqual(address['city'], 'Chicago')

    def test_intersection_geocoder(self):
        address = self.geocoder.geocode('Wabash and Jackson')
        self.assertEqual(address['city'], 'CHICAGO')
Example #19
 def clean(self):
     loc_info = self.cleaned_data.get('location')
     if isinstance(loc_info, list):
         # olwidget wraps geometries up as lists in case there's several per map
         assert len(loc_info) == 1
         loc_info = loc_info[0]
     if not loc_info:
         address = self.cleaned_data.get('address')
         if not address: 
             self._append_error('location', u'Either an address or a location must be specified.')
         else:
             # try to geocode the address...
             try:
                 geocoder = SmartGeocoder()
                 addr = geocoder.geocode(address) 
                 loc_info = addr['point']
             except AmbiguousResult:
                 self._append_error('location', u'Address is ambiguous, please specify a point directly.')
             except GeocodingException:
                 self._append_error('location', u'Unable to geocode address, please correct the address or specify a point directly.')
         # Again, olwidget expects these to be lists...
         loc_info = [loc_info]
         self.cleaned_data['location'] = loc_info
     return super(PlaceAdminForm, self).clean()
Example #20
def main(argv=None):
    url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=boston&scope=bonzai'
    schema = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)

    f = feedparser.parse(url)
    geocoder = SmartGeocoder()

    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title,
                                        description=e.description)
        except NewsItem.DoesNotExist:
            item = NewsItem()
            item.schema = schema
            item.title = e.title
            item.description = e.description
            item.url = e.link
            #item.location_name = e['x-calconnect-street']
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])

            try:
                if 'point' in e:
                    x, y = e.point.split(' ')
                else:
                    x, y = e.georss_point.split(' ')
                item.location = Point((float(y), float(x)))
                item.save()
            except:
                pass

            print "Added: %s" % item.title
Example #21
def fix_newsitem_coords(item, dry_run=True):
    """
    Try to fix a (presumably bad) NewsItem geometry by reversing its
    coordinates, or by re-geocoding its location name if it has one; use
    whatever works.

    If dry_run=False, the item will be saved.
    """
    if item.location is not None:
        loc = item.location.centroid
        print "Found %r outside bounds at %s, %s" % (item.title, loc.x, loc.y)
    else:
        loc = None
        print "NO location on %s: %s" % (item.schema.slug, item.title)
    fixed = False
    if item.location_name:
        from ebpub.geocoder import SmartGeocoder, AmbiguousResult
        try:
            result = SmartGeocoder().geocode(item.location_name)
        except AmbiguousResult, e:
            print "...%d choices, picking the first one" % len(e.choices)
            result = e.choices[0]
        except:
Example #22
    def update_from_query_params(self, request):
        """
        Update the filters based on query parameters.

        After this is called, it's recommended to redirect
        to a normalized form of the URL, e.g. self.sort(); self.make_url()

        This takes care to preserve query parameters that aren't used
        by any of the NewsitemFilters.
        """
        # Make a mutable copy so we can leave only the params that FilterChain
        # doesn't know about.
        params = request.GET.copy()

        def pop_key(key):
            # request.GET.pop() returns a sequence.
            # We only want a single value, stripped.
            val = params.pop(key, [''])[0]
            return val.strip()

        address = pop_key('address')
        if address:
            xy_radius, block_radius, cookies_to_set = block_radius_value(
                request)
            params.pop('radius', None)
            result = None
            try:
                result = SmartGeocoder().geocode(address)
            except AmbiguousResult, e:
                raise BadAddressException(address,
                                          block_radius,
                                          address_choices=e.choices)
            except (GeocodingException, ParsingError):
                raise BadAddressException(address,
                                          block_radius,
                                          address_choices=())
Example #23
#
#   everyblock is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with everyblock.  If not, see <http://www.gnu.org/licenses/>.
#

from ebdata.retrieval.utils import locations_are_close
from ebpub.db.models import NewsItem
from ebpub.geocoder import SmartGeocoder, ParsingError, GeocodingException
from django.contrib.gis.geos import Point

geocoder = SmartGeocoder()
THRESHOLD = 375


def fix_crime_geom():
    qs = NewsItem.objects.filter(schema__slug='crime', location__isnull=False)
    count = qs.count()
    for i, ni in enumerate(qs.iterator()):
        print '# => Checking %s of %s' % (i, count)
        x, y = [float(n) for n in ni.attributes['xy'].split(';')]
        pt = Point((x, y))
        pt.srid = 4326
        location_name = ni.location_name.replace('XX', '01')
        try:
            result = geocoder.geocode(location_name)
        except (GeocodingException, ParsingError):
Example #24
class NewsItemListDetailScraper(ListDetailScraper):
    """
    A ListDetailScraper that saves its data into the NewsItem table.

    Subclasses are required to set the `schema_slugs` attribute.

    self.schemas lazily loads the Schema objects the first time it's
    accessed. It is a dictionary in the format {slug: Schema}.

    self.schema is available if schema_slugs has only one element. It's the
    Schema object.

    self.lookups lazily loads a dictionary of all SchemaFields with
    is_lookup=True. The dictionary is in the format {name: schemafield}. If
    schema_slugs has more than one element, self.lookups is a dictionary in the
    format {schema_slug: {name: schemafield}}.

    self.schema_field_mapping lazily loads a dictionary of each SchemaField,
    mapping the name to the real_name. If schema_slugs has more than one element,
    self.schema_field_mapping is a dictionary in the format
    {schema_slug: {name: real_name}}.
    """
    schema_slugs = None
    logname = None

    def __init__(self, *args, **kwargs):
        if self.logname is None:
            self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
        super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
        self._schema_cache = None
        self._schemas_cache = None
        self._lookups_cache = None
        self._schema_fields_cache = None
        self._schema_field_mapping_cache = None
        self._geocoder = SmartGeocoder()

    # schemas, schema, lookups and schema_field_mapping are all lazily loaded
    # so that this scraper can be run (in raw_data(), xml_data() or
    # display_data()) without requiring a valid database to be set up.

    def _get_schemas(self):
        if self._schemas_cache is None:
            self._schemas_cache = dict([(s, Schema.objects.get(slug=s)) for s in self.schema_slugs])
        return self._schemas_cache
    schemas = property(_get_schemas)

    def _get_schema(self):
        if self._schema_cache is None:
            if len(self.schema_slugs) > 1:
                raise AttributeError('self.schema is only available if len(schema_slugs) == 1')
            self._schema_cache = self.schemas[self.schema_slugs[0]]
        return self._schema_cache
    schema = property(_get_schema)

    def _get_lookups(self):
        if self._lookups_cache is None:
            lc = dict([(s.slug, dict([(sf.name, sf) for sf in s.schemafield_set.filter(is_lookup=True)])) for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                lc = lc[self.schema_slugs[0]]
            self._lookups_cache = lc
        return self._lookups_cache
    lookups = property(_get_lookups)

    def _get_schema_fields(self):
        if self._schema_fields_cache is None:
            sfs = dict([(s.slug, dict([(sf.name, sf) for sf in s.schemafield_set.all()])) for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                sfs = sfs[self.schema_slugs[0]]
            self._schema_fields_cache = sfs
        return self._schema_fields_cache
    schema_fields = property(_get_schema_fields)

    def _get_schema_field_mapping(self):
        if self._schema_field_mapping_cache is None:
            schema_objs = self.schemas.values()
            mapping = field_mapping([s.id for s in schema_objs])
            fm = dict([(s.slug, mapping[s.id]) for s in schema_objs])
            if len(self.schema_slugs) == 1:
                fm = fm[self.schema_slugs[0]]
            self._schema_field_mapping_cache = fm
        return self._schema_field_mapping_cache
    schema_field_mapping = property(_get_schema_field_mapping)

    def get_or_create_lookup(self, schema_field_name, name, code, description='', schema=None, make_text_slug=True):
        """
        Returns the Lookup instance matching the given Schema slug, SchemaField
        name and Lookup.code, creating it (with the given name/code/description)
        if it doesn't already exist.

        If make_text_slug is True, then a slug will be created from the given
        name. If it's False, then the slug will be the Lookup's ID.
        """
        if len(self.schema_slugs) > 1:
            sf = self.lookups[schema][schema_field_name]
        else:
            sf = self.lookups[schema_field_name]
        return Lookup.objects.get_or_create_lookup(sf, name, code, description, make_text_slug, self.logger)

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and Block
        association.

        attributes is a dictionary to use to populate this NewsItem's Attribute
        object.
        """
        block = location = None
        if 'location' not in kwargs:
            location = self.geocode(kwargs['location_name'])
            if location:
                block = location['block']
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'])
                if location:
                    block = location['block']
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None)
        schema = schema or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=kwargs.get('location', location),
            location_name=kwargs['location_name'],
            location_object=kwargs.get('location_object', None),
            block=kwargs.get('block', block),
        )
        ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s (total created in this scrape: %s)', ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if newsitem.attributes[k] != v:
                self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, newsitem.attributes[k], v))
                newsitem.attributes[k] = v
                newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def update(self):
        """
        Updates the Schema.last_updated fields after scraping is done.
        """
        self.num_added = 0
        self.num_changed = 0
        update_start = datetime.datetime.now()

        # We use a try/finally here so that the DataUpdate object is created
        # regardless of whether the scraper raised an exception.
        try:
            got_error = True
            super(NewsItemListDetailScraper, self).update()
            got_error = False
        finally:
            # Rollback, in case the database is in an aborted transaction. This
            # avoids the "psycopg2.ProgrammingError: current transaction is aborted,
            # commands ignored until end of transaction block" error.
            from django.db import connection
            connection._rollback()

            update_finish = datetime.datetime.now()

            # Clear the Schema cache, in case the schemas have been updated in the
            # database since we started the scrape.
            self._schemas_cache = self._schema_cache = None

            for s in self.schemas.values():
                s.last_updated = datetime.date.today()
                s.save()
                DataUpdate.objects.create(
                    schema=s,
                    update_start=update_start,
                    update_finish=update_finish,
                    num_added=self.num_added,
                    num_changed=self.num_changed,
                    # None of our scrapers delete records yet, but we have the
                    # plumbing in place here in case future scrapers need to do
                    # that.
                    num_deleted=0,
                    num_skipped=self.num_skipped,
                    got_error=got_error,
                )

    def geocode(self, location_name):
        """
        Tries to geocode the given location string, returning the geocoder's
        result dictionary (including 'point' and 'block') or None.
        """
        try:
            return self._geocoder.geocode(location_name)
        except (GeocodingException, ParsingError):
            return None

    def safe_location(self, location_name, geom, max_distance=200):
        """
        Returns a location (geometry) to use, given a location_name and
        geometry. This is used for data sources that publish both a geometry
        and a location_name -- we double-check that the geometry is within
        a certain `max_distance` from the geocoded location_name.

        If there's a discrepancy or if the location_name can't be geocoded,
        this returns None.
        """
        location = self.geocode(location_name)
        if location is None:
            return None
        location_point = location['point']
        if not location_point:
            return None
        location_point.srid = 4326
        is_close, distance = locations_are_close(location_point, geom, max_distance)
        if not is_close:
            return None
        return geom
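A minimal subclass sketch showing how schema_slugs and create_newsitem() fit together. The schema slug, URL and CSV parsing are invented, and the has_detail flag plus the list_pages()/parse_list()/save()/fetch_data() hooks are assumptions about the ListDetailScraper base class, which is not fully shown on this page:

import datetime

class PotholeScraper(NewsItemListDetailScraper):
    schema_slugs = ('potholes',)   # hypothetical schema
    has_detail = False             # list pages carry everything we need

    def list_pages(self):
        # Placeholder URL; a real scraper would page through results.
        yield self.fetch_data('http://example.com/potholes.csv')

    def parse_list(self, page):
        for line in page.splitlines()[1:]:
            date, address, severity = line.split(',')[:3]
            yield {
                'date': datetime.datetime.strptime(date, '%Y-%m-%d').date(),
                'address': address,
                'severity': severity,
            }

    def save(self, old_record, list_record, detail_record):
        attributes = {'severity': list_record['severity']}
        self.create_newsitem(
            attributes,
            title='Pothole reported at %s' % list_record['address'],
            item_date=list_record['date'],
            location_name=list_record['address'],
        )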
Example #25
class BaseScraper(object):
    """
    Base class for all scrapers in ebdata.retrieval.scrapers.
    """
    logname = 'basescraper'
    sleep = 0
    timeout = 20

    def __init__(self, use_cache=True):
        if not use_cache:
            self.retriever = Retriever(cache=None, sleep=self.sleep, timeout=self.timeout)
        else:
            self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
        self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
        self.start_time = datetime.datetime.now()
        self._geocoder = SmartGeocoder()

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning the geocoder's
        result dictionary (including 'point') or None.
        """

        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it
        # is often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result: 
            # try to resolve based on zipcode...
            if zipcode is None: 
                self.logger.info(
                    "Ambiguous results for address %s. (no zipcode to resolve dispute)" % 
                    (location_name, ))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.info(
                    "Ambiguous results for address %s, but none in specified zipcode %s" % 
                    (location_name, zipcode))
                return None
            elif len(in_zip) > 1:
                self.logger.info(
                    "Ambiguous results for address %s in zipcode %s, guessing first." % 
                    (location_name, zipcode))
                return in_zip[0]
            else:
                return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.info(
                "Could not geocode location: %s: %s" %
                (location_name, traceback.format_exc()))
            return None

    def update(self):
        'Run the scraper.'
        raise NotImplementedError()

    def fetch_data(self, *args, **kwargs):
        return self.retriever.fetch_data(*args, **kwargs)

    def get_html(self, *args, **kwargs):
        """An alias for fetch_data().
        For backward compatibility.
        """
        return self.fetch_data(*args, **kwargs)

    @classmethod
    def parse_html(cls, html):
        from lxml import etree
        from cStringIO import StringIO
        return etree.parse(StringIO(html), etree.HTMLParser())

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.
        
        kwargs MAY have the following keys: 
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and Block
        association.

        attributes is a dictionary to use to populate this NewsItem's Attribute
        objects.
        """

        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
            if location:
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
                if location:
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)', schema.slug, ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            elif k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r' %
                                 (newsitem.id, k, v))
            elif newsitem.attributes.get(k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """unified API for updating or creating a NewsItem.
        """
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)
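A hedged sketch of how create_or_update() might be driven from a concrete scraper. The save_record() method name and the record fields are invented; looking up an existing NewsItem by url is just one plausible matching strategy:

    # Inside a hypothetical BaseScraper subclass that sets self.schema:
    def save_record(self, record):
        try:
            old = NewsItem.objects.get(schema=self.schema, url=record['url'])
        except NewsItem.DoesNotExist:
            old = None
        self.create_or_update(
            old,
            attributes={'category': record['category']},
            title=record['title'],
            item_date=record['item_date'],
            location_name=record['address'],
            url=record['url'],
        )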
Example #26
    def update_from_request(self, filter_sf_dict):
        """Update the list of filters based on the request params.

        After this is called, it's recommended to redirect to a
        normalized form of the URL, which you can get via self.sort();
        self.make_url()

        ``filter_sf_dict`` is a mapping of name -> SchemaField which have
        either is_filter or is_searchable True.  We remove
        SchemaFields that we create filters for. (This is so that
        templates can display input widgets for the ones we're not
        already filtering by.)

        TODO: This should not bail out on the first error,
        it should do as much as possible and signal multiple errors.
        (Use the forms framework?)
        """
        request, context = self.request, self.context
        qs = self.qs
        params = request.GET.copy()

        def pop_key(key, single=False):
            """
            Pop the value(s) from params, treat it as a
            comma-separated list of values, and split that into a
            list. So ?foo=bar,baz is equivalent to ?foo=bar&foo=baz.

            If single==True, return only the first one; in the example
            we'd return 'bar'.  Otherwise, by default, return the
            list; in the example we'd return ['bar', 'baz']
            """
            result = []
            # Doesn't seem to be a way to get a list of values *and*
            # remove it in one call; so use both getlist() and pop().
            values = params.getlist(key) or [u'']
            params.pop(key, None)
            for value in values:
                value = value.replace(u'+', u' ')  # XXX does django do this already?
                result.extend([s.strip() for s in value.split(u',')])
            result = [r for r in result if r]
            if single:
                return result[0] if result else u''
            return result

        # IDs.
        ids = pop_key('id', single=False)
        if ids:
            self.replace('id', *ids)

        # Address.
        address = pop_key('address', single=True)
        if address:
            xy_radius, block_radius, cookies_to_set = block_radius_value(
                request)
            pop_key('radius')  # Just to remove it; block_radius_value() used it.
            result = None
            try:
                result = SmartGeocoder().geocode(address)
            except AmbiguousResult, e:
                raise BadAddressException(address,
                                          block_radius,
                                          address_choices=e.choices)
            except (GeocodingException, ParsingError):
                raise BadAddressException(address,
                                          block_radius,
                                          address_choices=())
Example #27
    def import_csv_view(self, request):
        if not self.has_add_permission(request):
            raise PermissionDenied

        if request.method == 'GET':
            import_form = PlaceImportForm()
        if request.method == 'POST':
            import_form = PlaceImportForm(request.POST, request.FILES)

        if not import_form.is_bound or not import_form.is_valid():
            return self._show_import_csv_form(request, import_form)

        # csv fields:
        # pretty_name, address, lat, lon, url, <synonym>, <synonym>, ...

        context = dict(
            errors=[],
            actions_taken=[],
        )

        validated_rows = []

        place_type = import_form.cleaned_data['place_type']
        try:
            csvfile = import_form.cleaned_data['csv_file']
            rows = csv.reader(csvfile)
        except:
            message = "Unable to read the specified CSV file"
            context['errors'].append(message)
            return self._show_import_csv_results(request, context)

        try:
            for row in rows:
                if len(row) < 2:
                    message = "Line %d: Missing required fields." % rows.line_num
                    context['errors'].append(message)
                    continue

                synonyms = []
                point = None
                place_url = ''

                pretty_name, address = [x.strip() for x in row[0:2]]
                if pretty_name == '':
                    message = "Line %d: Empty name" % rows.line_num
                    context['errors'].append(message)
                    continue

                if len(row) > 2:
                    try:
                        lat, lon = row[2:4]
                        if lat != '' or lon != '':
                            lat = float(lat.strip())
                            lon = float(lon.strip())
                            point = geos.Point(lon, lat)
                            if len(row) > 4:
                                place_url = row[4]
                                synonyms = [x.strip() for x in row[5:]]
                    except ValueError:
                        message = 'Line %d "%s": Invalid lat, lon' % (
                            rows.line_num, pretty_name)
                        context['errors'].append(message)
                        continue

                if point is None:
                    if address == '':
                        message = 'Line %d "%s": Address and lat,lon are both empty.' % (
                            rows.line_num, pretty_name)
                        context['errors'].append(message)
                        continue

                    # try to geocode the address
                    try:
                        geocoder = SmartGeocoder()
                        addr = geocoder.geocode(address)
                        point = addr['point']
                    except AmbiguousResult:
                        message = 'Line %d "%s": Address "%s" is ambiguous, please specify a point directly.' % (
                            rows.line_num, pretty_name, address)
                        context['errors'].append(message)
                        continue
                    except GeocodingException:
                        message = 'Line %d "%s": Unable to geocode address "%s", please correct the address or specify a point directly.' % (
                            rows.line_num, pretty_name, address)
                        context['errors'].append(message)
                        continue

                # phew!
                validated_rows.append(
                    [pretty_name, address, point, place_url, synonyms])

        except csv.Error, e:
            message = "Stopped on line %d: %s" % (rows.line_num, e)
            context['errors'].append(message)
            return self._show_import_csv_results(request, context)
class NewsItemListDetailScraper(ListDetailScraper):
    """
    A ListDetailScraper that saves its data into the NewsItem table.

    Subclasses are required to set the `schema_slugs` attribute.

    Once you've set schema_slugs, there are a number of properties for
    conveniently accessing the relevant Schemas and SchemaFields:

    self.schemas lazily loads the list of Schema objects the first time it's
    accessed. It is a dictionary in the format {slug: Schema}.

    self.schema is available if schema_slugs has only one element. It's the
    Schema object.

    self.lookups lazily loads a dictionary of all SchemaFields with
    lookup=True. The dictionary is in the format {name: schemafield}.
    If schema_slugs has more than one element, self.lookups is a
    dictionary in the format {schema_slug: {name: schemafield}}.

    self.schema_fields lazily loads a dictionary of each SchemaField,
    mapping the name to the SchemaField object.
    If schema_slugs has more than one element, self.schema_fields is a
    dictionary in the format {schema_slug: {name: schema_field}}.

    self.schema_field_mapping lazily loads a dictionary of each
    SchemaField, mapping the name to the real_name.
    If schema_slugs has more than one element, self.schema_field_mapping
    is a dictionary in the format {schema_slug: {name: real_name}}.
    """
    schema_slugs = None
    logname = None

    def __init__(self, *args, **kwargs):
        if self.logname is None:
            self.logname = '%s.%s' % (settings.SHORT_NAME, self.schema_slugs[0])
        super(NewsItemListDetailScraper, self).__init__(*args, **kwargs)
        self._schema_cache = None
        self._schemas_cache = None
        self._lookups_cache = None
        self._schema_fields_cache = None
        self._schema_field_mapping_cache = None
        self._geocoder = SmartGeocoder()

    # schemas, schema, lookups and schema_field_mapping are all lazily loaded
    # so that this scraper can be run (in raw_data(), xml_data() or
    # display_data()) without requiring a valid database to be set up.

    @property
    def schemas(self):
        if self._schemas_cache is None:
            self._schemas_cache = dict([(s, Schema.objects.get(slug=s)) for s in self.schema_slugs])
        return self._schemas_cache

    @property
    def schema(self):
        if self._schema_cache is None:
            if len(self.schema_slugs) > 1:
                raise AttributeError('self.schema is only available if len(schema_slugs) == 1')
            self._schema_cache = self.schemas[self.schema_slugs[0]]
        return self._schema_cache

    @property
    def lookups(self):
        if self._lookups_cache is None:
            lc = dict([(s.slug, dict([(sf.name, sf) for sf in s.schemafield_set.filter(is_lookup=True)])) for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                lc = lc[self.schema_slugs[0]]
            self._lookups_cache = lc
        return self._lookups_cache

    @property
    def schema_fields(self):
        if self._schema_fields_cache is None:
            sfs = dict([(s.slug, dict([(sf.name, sf)
                                       for sf in s.schemafield_set.all()]))
                        for s in self.schemas.values()])
            if len(self.schema_slugs) == 1:
                sfs = sfs[self.schema_slugs[0]]
            self._schema_fields_cache = sfs
        return self._schema_fields_cache

    @property
    def schema_field_mapping(self):
        if self._schema_field_mapping_cache is None:
            schema_objs = self.schemas.values()
            mapping = field_mapping([s.id for s in schema_objs])
            fm = dict([(s.slug, mapping[s.id]) for s in schema_objs])
            if len(self.schema_slugs) == 1:
                fm = fm[self.schema_slugs[0]]
            self._schema_field_mapping_cache = fm
        return self._schema_field_mapping_cache


    def get_or_create_lookup(self, schema_field_name, name, code, description='', schema=None, make_text_slug=True):
        """
        Returns the Lookup instance matching the given Schema slug, SchemaField
        name and Lookup.code, creating it (with the given name/code/description)
        if it doesn't already exist.

        If make_text_slug is True, then a slug will be created from the given
        name. If it's False, then the slug will be the Lookup's ID.
        """
        if len(self.schema_slugs) > 1:
            sf = self.lookups[schema][schema_field_name]
        else:
            sf = self.lookups[schema_field_name]
        return Lookup.objects.get_or_create_lookup(sf, name, code, description, make_text_slug, self.logger)
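
    # e.g. (hypothetical field and values):
    #   status = self.get_or_create_lookup('status', u'Open', u'OPEN',
    #                                      description=u'Case is still open')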


    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.
        
        kwargs MAY have the following keys: 
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and Block
        association.

        attributes is a dictionary to use to populate this NewsItem's Attribute
        objects.
        """

        block = kwargs.get('block')
        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
            if location:
                block = location['block']
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
                if location:
                    block = location['block']
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
            block=block,
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)', schema.slug, ni.id, self.num_added)
        return ni
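
    # Example call (hypothetical values), typically made from a subclass's save():
    #
    #   self.create_newsitem(
    #       attributes={'status': status_lookup.id},
    #       title=u'Pothole reported',
    #       item_date=datetime.date(2011, 5, 3),
    #       location_name=u'200 S. Wabash Ave.',
    #       zipcode=u'60604',
    #       convert_to_block=True,
    #   )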

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            elif k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r' %
                                 (newsitem.id, k, v))
            elif newsitem.attributes.get(k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """unified API for updating or creating a NewsItem.
        """
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)


    def update(self):
        """
        Updates the Schema.last_updated fields after scraping is done.
        """
        self.num_added = 0
        self.num_changed = 0
        update_start = datetime.datetime.now()

        # We use a try/finally here so that the DataUpdate object is created
        # regardless of whether the scraper raised an exception.
        try:
            got_error = True
            super(NewsItemListDetailScraper, self).update()
            got_error = False
        finally:
            # Rollback, in case the database is in an aborted
            # transaction. This avoids the "psycopg2.ProgrammingError:
            # current transaction is aborted, commands ignored until
            # end of transaction block" error.
            from django.db import connection
            connection._rollback()

            update_finish = datetime.datetime.now()

            # Clear the Schema cache, in case the schemas have been
            # updated in the database since we started the scrape.
            self._schemas_cache = self._schema_cache = None

            for s in self.schemas.values():
                s.last_updated = datetime.date.today()
                s.save()
                DataUpdate.objects.create(
                    schema=s,
                    update_start=update_start,
                    update_finish=update_finish,
                    num_added=self.num_added,
                    num_changed=self.num_changed,
                    # None of our scrapers delete records yet, but we have the
                    # plumbing in place here in case future scrapers need to do
                    # that.
                    num_deleted=0,
                    num_skipped=self.num_skipped,
                    got_error=got_error,
                )

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning a dictionary of
        address information (including 'point' and 'block') or None.
        """

        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it
        # is often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result:
            # try to resolve based on zipcode...
            if zipcode is None:
                self.logger.warning("Ambiguous results for address %s. (no zipcode to resolve dispute)" % (location_name, ))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.warning("Ambiguous results for address %s, but none in specified zipcode %s" % (location_name, zipcode))
                return None
            if len(in_zip) > 1:
                self.logger.warning("Ambiguous results for address %s in zipcode %s, guessing first." % (location_name, zipcode))
            return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.warning("Could not geocode location: %s: %s" % (location_name, traceback.format_exc()))
            return None
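
    # e.g. (hypothetical address/zip): self.geocode(u'100 Main St', zipcode=u'02134')
    # returns the geocoder's result dict (including 'point', 'block' and 'zip')
    # on success, or None if the address can't be resolved.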


    def safe_location(self, location_name, geom, max_distance=200):
        """
        Returns a location (geometry) to use, given a location_name and
        geometry. This is used for data sources that publish both a geometry
        and a location_name -- we double-check that the geometry is within
        a certain `max_distance` from the geocoded location_name.

        If there's a discrepancy or if the location_name can't be geocoded,
        this returns None.
        """
        location = self.geocode(location_name)
        if location is None:
            return None
        location_point = location['point']
        if not location_point:
            return None
        location_point.srid = 4326
        is_close, distance = locations_are_close(location_point, geom, max_distance)
        if not is_close:
            return None
        return geom
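
    # e.g. (hypothetical): self.safe_location(u'200 S. Wabash Ave.', feed_point)
    # returns feed_point only if it falls within max_distance of the geocoded
    # address; otherwise (or if geocoding fails) it returns None.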

    def last_updated_time(self, schema=None):
        """
        Returns a DateTime representing the last time we started
        scraping our schema(s).  (We use start time rather than end
        time on the assumption that a few overlaps are preferable to
        missing updates.)
        """
        schema = schema or self.schema
        try:
            last_update = DataUpdate.objects.filter(schema=schema).order_by('-update_start')[0]
            return last_update.update_start
        except IndexError:
            # Use the unix epoch (1970) as a stand-in for "never updated".
            return datetime.datetime.fromtimestamp(0)
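
# Sketch (not part of the original source): a minimal hypothetical subclass
# showing how the helpers above are typically combined. The schema slug,
# field names and list_record keys are assumptions for illustration only.
class ExamplePotholeScraper(NewsItemListDetailScraper):
    schema_slugs = ('potholes',)

    def save(self, old_record, list_record, detail_record):
        # 'status' is assumed to be a lookup SchemaField on the hypothetical schema.
        status = self.get_or_create_lookup('status', list_record['status'], list_record['status'])
        self.create_or_update(
            old_record,
            {'status': status.id},
            title=list_record['title'],
            item_date=list_record['date'],
            location_name=list_record['address'],
        )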
Exemple #30
0
    def save(self, old_record, list_record, detail_record):
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record['updated_parsed'][:3])

        # Get the precinct from the tags.
        precincts = [
            'A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13', 'E18',
            'E5'
        ]
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return

        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record['summary']

        full_description = list_record['content'][0]['value']
        full_description = text_from_html(full_description)

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" %
                             (list_record['title'], list_record['link']))
            return

        location = None
        location_name = u''
        block = None

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except GeocodingException:
                log_exception(level=logging.DEBUG)
                continue
            location_name = location['address']
            block = location['block']
            location = location['point']
            break
        else:
            self.logger.info("no addresses geocoded in %r" %
                             list_record['title'])
            return

        kwargs = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=list_record['title'],
            description=description,
            url=list_record['link'],
        )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
Exemple #31
0
def auto_locations(paragraph_list, default_city=''):
    """
    Given a list of strings, detects all valid, unique addresses and returns a
    tuple (result, report), where result is a list of tuples in the format
    (address, point, excerpt, block) and report is a string of what happened.

    If default_city is given, it will be used in the geocoding for detected
    addresses that don't specify a city.
    """
    result, report = [], []
    addresses_seen = set()
    geocoder = SmartGeocoder()
    for para in paragraph_list:
        for addy, city in parse_addresses(para):
            # Skip addresses if they have a city that's a known suburb.
            if city and Suburb.objects.filter(
                    normalized_name=normalize(city)).count():
                report.append('got suburb "%s, %s"' % (addy, city))
                continue

            # Try geocoding the address. If a city was provided, first try
            # geocoding with the city, then fall back to just the address
            # (without the city).
            point = None
            attempts = [addy]
            if default_city:
                attempts.insert(0, '%s, %s' % (addy, default_city))
            if city and city.lower() != default_city.lower():
                attempts.insert(0, '%s, %s' % (addy, city))
            for attempt in attempts:
                try:
                    point = geocoder.geocode(attempt)
                    break
                except AmbiguousResult:
                    report.append('got ambiguous address "%s"' % attempt)
                    # Don't try any other address attempts, because they only
                    # get *more* ambiguous. Plus, the subsequent attempts could
                    # be incorrect. For example, with this:
                    #    addy = '100 Broadway'
                    #    city = 'Manhattan'
                    #    default_city = 'Brooklyn'
                    # There are multiple "100 Broadway" addresses in Manhattan,
                    # so geocoding should fail at this point. It should not
                    # roll back to try the default_city (Brooklyn).
                    break
                except (DoesNotExist, InvalidBlockButValidStreet):
                    report.append('got nonexistent address "%s"' % attempt)
                except ParsingError:
                    report.append('got parsing error "%s"' % attempt)
            if point is None:
                continue  # This address could not be geocoded.

            if point['address'] in addresses_seen:
                continue
            if len(para) > 300:
                try:
                    excerpt = smart_excerpt(para, addy)
                except ValueError:
                    excerpt = para
            else:
                excerpt = para
            result.append((addy, point['point'], excerpt, point['block']))
            addresses_seen.add(point['address'])
    return (result, '; '.join(report))
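
# Usage sketch (hypothetical input text): auto_locations() takes raw paragraphs
# and returns the detected, geocoded addresses plus a report of what happened.
#
#   result, report = auto_locations(
#       [u'A fire was reported at 200 S. Wabash Ave. around midnight.'],
#       default_city='Chicago')
#   for address, point, excerpt, block in result:
#       print address, point.wkt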
Exemple #32
0
class BaseScraper(object):
    """
    Base class for all scrapers in ebdata.retrieval.scrapers.
    """
    logname = 'basescraper'
    sleep = 0
    timeout = 20

    def __init__(self, use_cache=True):
        if not use_cache:
            self.retriever = Retriever(cache=None, sleep=self.sleep, timeout=self.timeout)
        else:
            self.retriever = Retriever(sleep=self.sleep, timeout=self.timeout)
        self.logger = logging.getLogger('eb.retrieval.%s' % self.logname)
        self.start_time = datetime.datetime.now()
        self._geocoder = SmartGeocoder()
        self.num_added = 0
        self.num_changed = 0

    def geocode(self, location_name, zipcode=None):
        """
        Tries to geocode the given location string, returning a dictionary of
        address information (including 'point' and 'block') or None.
        """

        # Try to look up the address; if it is ambiguous, attempt to use
        # any provided zipcode information to resolve the ambiguity.
        # The zipcode is not included in the initial pass because it
        # is often too picky, yielding no results when there is a
        # legitimate nearby zipcode identified in either the address
        # or street number data.
        try:
            return self._geocoder.geocode(location_name)
        except AmbiguousResult as result:
            # try to resolve based on zipcode...
            if zipcode is None:
                self.logger.info(
                    "Ambiguous results for address %s. (no zipcode to resolve dispute)" %
                    (location_name, ))
                return None
            in_zip = [r for r in result.choices if r['zip'] == zipcode]
            if len(in_zip) == 0:
                self.logger.info(
                    "Ambiguous results for address %s, but none in specified zipcode %s" %
                    (location_name, zipcode))
                return None
            if len(in_zip) > 1:
                self.logger.info(
                    "Ambiguous results for address %s in zipcode %s, guessing first." %
                    (location_name, zipcode))
            return in_zip[0]
        except (GeocodingException, ParsingError):
            self.logger.info(
                "Could not geocode location: %s: %s" %
                (location_name, traceback.format_exc()))
            return None

    def update(self):
        'Run the scraper.'
        raise NotImplementedError()

    def fetch_data(self, *args, **kwargs):
        return self.retriever.fetch_data(*args, **kwargs)

    def get_html(self, *args, **kwargs):
        """An alias for fetch_data().
        For backward compatibility.
        """
        return self.fetch_data(*args, **kwargs)

    @classmethod
    def parse_html(cls, html):
        from lxml import etree
        from cStringIO import StringIO
        return etree.parse(StringIO(html), etree.HTMLParser())

    @transaction.commit_on_success
    def create_newsitem(self, attributes, **kwargs):
        """
        Creates and saves a NewsItem with the given kwargs. Returns the new
        NewsItem.

        kwargs MUST have the following keys:
            title
            item_date
            location_name
        For any other kwargs whose values aren't provided, this will use
        sensible defaults.
        
        kwargs MAY have the following keys: 
            zipcode - used to disambiguate geocoded locations

        kwargs may optionally contain a 'convert_to_block' boolean. If True,
        this will convert the given kwargs['location_name'] to a block level
        but will use the real (non-block-level) address for geocoding and Block
        association.

        attributes is a dictionary to use to populate this NewsItem's Attribute
        objects.
        """

        location = kwargs.get('location')
        location_name = kwargs.get('location_name')
        assert location or location_name, "At least one of location or location_name must be provided"
        if location is None:
            location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
            if location:
                location = location['point']
        if kwargs.pop('convert_to_block', False):
            kwargs['location_name'] = address_to_block(kwargs['location_name'])
            # If the exact address couldn't be geocoded, try using the
            # normalized location name.
            if location is None:
                location = self.geocode(kwargs['location_name'], zipcode=kwargs.get('zipcode'))
                if location:
                    location = location['point']

        # Normally we'd just use "schema = kwargs.get('schema', self.schema)",
        # but self.schema will be evaluated even if the key is found in
        # kwargs, which raises an error when using multiple schemas.
        schema = kwargs.get('schema', None) or self.schema

        ni = NewsItem.objects.create(
            schema=schema,
            title=kwargs['title'],
            description=kwargs.get('description', ''),
            url=kwargs.get('url', ''),
            pub_date=kwargs.get('pub_date', self.start_time),
            item_date=kwargs['item_date'],
            location=location,
            location_name=location_name,
            location_object=kwargs.get('location_object', None),
        )
        if attributes is not None:
            ni.attributes = attributes
        self.num_added += 1
        self.logger.info(u'Created NewsItem %s: %s (total created in this scrape: %s)', schema.slug, ni.id, self.num_added)
        return ni

    @transaction.commit_on_success
    def update_existing(self, newsitem, new_values, new_attributes):
        """
        Given an existing NewsItem and dictionaries new_values and
        new_attributes, determines which values and attributes have changed
        and saves the object and/or its attributes if necessary.
        """
        newsitem_updated = False
        # First, check the NewsItem's values.
        for k, v in new_values.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if getattr(newsitem, k) != v:
                self.logger.info('ID %s %s changed from %r to %r' % (newsitem.id, k, getattr(newsitem, k), v))
                setattr(newsitem, k, v)
                newsitem_updated = True
        if newsitem_updated:
            newsitem.save()
        else:
            self.logger.debug("No change to %s <%s>" % (newsitem.id, newsitem))
        # Next, check the NewsItem's attributes.
        for k, v in new_attributes.items():
            if isinstance(v, datetime.datetime) and v.tzinfo is not None:
                # Django datetime fields are not timezone-aware, so we
                # can't compare them without stripping the zone.
                v = v.astimezone(local_tz).replace(tzinfo=None)
            if newsitem.attributes.get(k) == v:
                continue
            elif k not in newsitem.attributes:
                self.logger.info('ID %s %s was missing, setting to %r' %
                                 (newsitem.id, k, v))
            elif newsitem.attributes.get(k) != v:
                self.logger.info('ID %s %s changed from %r to %r' %
                                 (newsitem.id, k, newsitem.attributes[k], v))
            newsitem.attributes[k] = v
            newsitem_updated = True
        if newsitem_updated:
            self.num_changed += 1
            self.logger.debug('Total changed in this scrape: %s', self.num_changed)
        else:
            self.logger.debug('No changes to NewsItem %s detected', newsitem.id)

    def create_or_update(self, old_record, attributes, **kwargs):
        """unified API for updating or creating a NewsItem.
        """
        if old_record:
            self.update_existing(old_record, kwargs, attributes or {})
        else:
            self.create_newsitem(attributes=attributes, **kwargs)
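
# Sketch (not part of the original source): a hypothetical BaseScraper subclass
# using the fetching helpers above. The URL and parsing are assumptions; a real
# subclass would walk the parsed tree and call create_or_update() per record.
class ExampleFeedScraper(BaseScraper):
    logname = 'example_feed'
    schema = None  # a real subclass would look up its Schema here

    def update(self):
        html = self.fetch_data('http://example.com/incidents.html')
        tree = self.parse_html(html)
        # ... extract records from tree, then self.create_or_update(...) ...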
Exemple #33
0
    def setUp(self):
        self.geocoder = SmartGeocoder(use_cache=False)
Exemple #36
0
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
Exemple #37
0
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                x, y = point.split(' ')
            if True:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text)
                        log_exception(level=logging.DEBUG)
                        continue
                    except:
                        logger.error('uncaught geocoder exception on %r\n' %
                                     addr)
                        log_exception()
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))