def save(self, old_record, list_record, detail_record):
        """Create or update a news item from one list record.

        ``list_record`` is read as a mapping and must provide ``geo_long``,
        ``geo_lat``, ``summary`` and ``title``.  ``detail_record`` is accepted
        for interface compatibility but is not used here.

        The base field values come from ``self.unique_fields(list_record)``;
        ``description`` and ``location`` are then overwritten with values
        computed in this method.  When ``old_record`` is truthy the existing
        item is updated, otherwise a new one is created.

        Records located at exactly (0, 0) are skipped.  Always returns None.
        """
        kwargs = self.unique_fields(list_record)

        location = Point((float(list_record['geo_long']),
                          float(list_record['geo_lat'])))

        # A (0, 0) coordinate pair is treated as a bad location.  Compare
        # against a 2-tuple: the previous 3-tuple literal could never match,
        # so these records were never actually skipped.
        if (location.x, location.y) == (0.0, 0.0):
            # Parenthesized single-argument form prints the same text under
            # both the Python 2 print statement and the print function.
            print("skipping %r as it has bad location 0,0" % list_record['title'])
            return

        # Remove address and rating from summary.
        content = address_re.sub('', list_record['summary'])

        # When a rating is present, keep it as an attribute and strip its
        # text from the description; otherwise no attributes are passed on.
        attributes = None
        rating_match = rating_re.search(content)
        if rating_match:
            attributes = {'rating': int(rating_match.group(1))}
            content = rating_re.sub('', content)

        content = preprocess_to_string(content, drop_tags=('p', 'br', 'b',))
        kwargs.update(description=content, location=location)

        if old_record:
            self.update_existing(old_record, kwargs, attributes)
        else:
            self.create_newsitem(attributes=attributes, **kwargs)
Example #2
0
    def unique_fields(self, list_record):
        """Build the field values that together identify one article.

        Not necessarily a primary key, but for this script's purposes these
        are the fields that in combination uniquely identify an article.

        ``list_record`` is read as a mapping and must provide
        ``updated_parsed`` (its first three items are passed to
        ``datetime.date``), ``tags`` (each with a ``term``), ``content``
        (the first item's ``value`` is used) and ``title``.

        Returns a dict with ``item_date``, ``location``, ``location_name``,
        ``title`` and ``description``, or None when the record has no tags.
        ``location`` stays None and ``location_name`` stays empty when no
        address could be geocoded.
        """
        date = datetime.date(*list_record['updated_parsed'][:3])
        precincts = ('A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5')
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            # NOTE(review): callers get None here instead of a dict --
            # confirm that every caller handles that.
            return

        # First tag that names a known precinct, or None if there is none.
        # (A plain for/break loop would leave the loop variable bound to the
        # last tag on a miss, hiding the "not found" case below.)
        # TODO: we need a LocationType for precincts, and shapes; and
        # then we can set newsitem.location_object to the Location
        # for this precinct.
        precinct = next((tag for tag in tags if tag in precincts), None)
        if precinct is None:
            self.logger.debug("no precinct found in tags %r", tags)

        if 'Boston 24' in tags:
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # These are citywide though, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")

        description = list_record['content'][0]['value']
        # TODO: we should have a stock 'clean up html' function.
        description = preprocess_to_string(
            description,
            drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'p', 'strong', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'),
            drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'),
            drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target'))
        from ebdata.retrieval.utils import convert_entities
        description = convert_entities(description)

        addrs = parse_addresses(description)
        if not addrs:
            self.logger.info("no addresses found in %r", list_record['title'])

        location = None
        location_name = u''
        # Every address is tried; a later successful geocode overwrites an
        # earlier one, so the last address that geocodes wins.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                # Imported inside the try so that a missing geocoder module
                # is reported per address rather than aborting the record.
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(addr)
                if (x, y) != (None, None):
                    location = Point((float(x), float(y)))
                    location_name = addr.title()
            except Exception:
                # Best effort: record the failure (with traceback) and move
                # on to the next address.  Unlike a bare ``except``, this
                # lets KeyboardInterrupt and SystemExit propagate.
                self.logger.exception("ugh, %r", addr)

        return dict(item_date=date,
                    location=location,
                    location_name=location_name,
                    title=list_record['title'],
                    description=description,
                    )