Exemple #1
0
def main(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                x, y = point.split(' ')
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
Exemple #2
0
def main(argv=None):
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)

    f = feedparser.parse(url)

    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            if 'point' in e:
                x,y = e.point.split(' ')
            elif 'georss_point' in e:
                x,y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            f = cStringIO.StringIO()
            traceback.print_exc(file=f)
            msg = f.getvalue()
            print msg
    def unique_fields(self, list_record):
        # not necessarily primary key, but for this script's purposes
        # these are the fields that in combination uniquely idenfity
        # an article.
        date = datetime.date(*list_record['updated_parsed'][:3])
        precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5']
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return
        for precinct in tags:
            if precinct in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we can set newsitem.location_object to the Location
                # for this precinct.
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        if 'Boston 24' in tags:
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # These are citywide though, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")

        description = list_record['content'][0]['value']
        # TODO: we should have a stock 'clean up html' function.
        description = preprocess_to_string(
            description,
            drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'p', 'strong', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'),
            drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'),
            drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target'))
        from ebdata.retrieval.utils import convert_entities
        description = convert_entities(description)
        #description = description.replace(' ', ' ').replace(' ', ' ')

        addrs = parse_addresses(description)
        if not addrs:
            self.logger.info("no addresses found in %r" % list_record['title'])

        location = None
        location_name = u''
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(addr)
                if (x, y) != (None, None):
                    location = Point((float(x), float(y)))
                    location_name = addr.title()
            except:
                print "ugh, %r" % addr
                # XXX log something

        return dict(item_date=date,
                    location=location,
                    location_name=location_name,
                    title=list_record['title'],
                    description=description,
                    )
Exemple #4
0
def main(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                x, y = point.split(' ')
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))