Code example #1
File: random_news.py Project: egrommet/openblock
def main(count):
    schema = 'local-news'

    locations = list(Location.objects.all())
    random.shuffle(locations)

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)
        
    for i in range(int(count)):
        item = NewsItem()
        item.schema = schema
        item.title = '%d Random News %s' % (i, uuid.uuid1())
        item.description = item.title + ' blah' * 100
        item.url = 'http://example.com'
        # Random time between now and one week ago.
        date = datetime.datetime.now() - datetime.timedelta(random.uniform(0.0, 7.0))
        item.pub_date = item.item_date = date

        # Pick a random location from the ones we know.
        location = locations[i % len(locations)]
        item.location_object = location
        item.location_name = location.name
        # It would be cool to pick a random location within the bounds,
        # but that would take thought... use the center.
        try:
            item.location = location.location.centroid
        except AttributeError:
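            # location.location was presumably None here, so .centroid raised AttributeError.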
            print "whoops"
            continue
        print "Added: %s at %s (%s)" % (item.title, location.name, item.location.wkt)
        item.save()
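
The comment above settles for the centroid instead of a random point within the location's bounds; a minimal sketch of the latter via GEOS rejection sampling (the helper name is hypothetical; it assumes location.location is a GEOS geometry):

import random
from django.contrib.gis.geos import Point

def random_point_in(geom, max_tries=100):
    # Sample the bounding box until a point lands inside the geometry.
    xmin, ymin, xmax, ymax = geom.extent
    for _ in range(max_tries):
        candidate = Point(random.uniform(xmin, xmax), random.uniform(ymin, ymax))
        if geom.contains(candidate):
            return candidate
    # Give up and fall back to the centroid.
    return geom.centroid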
Code example #2
File: add_events.py Project: Helpershub/everyblock
def main():
    """ Download Calendar RSS feed and update database """

    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'
    
    parser = OptionParser()
    parser.add_option('-q', '--quiet', action="store_true", dest="quiet", 
        default=False, help="no output")
        
    (options, args) = parser.parse_args()

    if len(args) > 0:
        return parser.error('script does not take any arguments')
    
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)
        
    feed = feedparser.parse(url)
    
    for entry in feed.entries:
        try:
            item = NewsItem.objects.get(title=entry.title, 
                description=entry.description)
            status = "Updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "Added"
        
        try:
            item.schema = schema
            item.title = entry.title
            item.description = entry.description
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry['geo_long']), 
                float(entry['geo_lat'])))
            item.save()
            if not options.quiet:
                print "%s: %s" % (status, item.title)
        except ValueError:
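            # float() raises ValueError on malformed geo:lat/geo:long; a missing key (KeyError) is not caught.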
            if not options.quiet:
                print "unexpected error:", sys.exc_info()[1]
Code example #3
File: add_news.py Project: Helpershub/everyblock
def main(argv=None):
    url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=boston&scope=bonzai'
    schema = 'local-news'
    
    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)
        
    f = feedparser.parse(url)
    geocoder = SmartGeocoder()
    
    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
        except NewsItem.DoesNotExist:
            item = NewsItem()
            item.schema = schema
            item.title = e.title
            item.description = e.description
            item.url = e.link
            #item.location_name = e['x-calconnect-street']
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
        
            try:
                if 'point' in e:
                    x,y = e.point.split(' ')
                else:
                    x,y = e.georss_point.split(' ')
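                # GeoRSS points are "lat lon", so x holds latitude here; Point takes (lon, lat).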
                item.location = Point((float(y), float(x)))
                item.save()
            except:
                pass
        
            print "Added: %s" % item.title
Code example #4
def _make_items(number, schema):
    items = []
    from django.conf import settings
    local_tz = pytz.timezone(settings.TIME_ZONE)
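    # pytz zones must be attached with localize(); passing tzinfo= to replace() would use the zone's raw LMT offset.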
    curdate = local_tz.localize(datetime.datetime.now().replace(microsecond=0))
    inc = datetime.timedelta(days=-1)
    for i in range(number):
        desc = '%s item %d' % (schema.slug, i)
        items.append(NewsItem(schema=schema, title=desc,
                              description=desc,
                              item_date=curdate.date(),
                              pub_date=curdate,
                              location=geos.Point(0,0)))
        curdate += inc
    return items
Code example #5
File: eb.py Project: christaggart/openblock
    def render(self, context):
        schema_id = self.schema_id_variable.resolve(context)
        newsitem_id = self.newsitem_id_variable.resolve(context)
        att_value = self.att_value_variable.resolve(context)
        sf = SchemaField.objects.select_related().get(schema__id=schema_id, name=self.att_name)
        ni_list = NewsItem.objects_by_schema(sf.schema).exclude(id=newsitem_id).by_attribute(sf, att_value).order_by('-item_date')
        #populate_attributes_if_needed(ni_list, [sf.schema])

        # We're assigning directly to context.dicts[-1] so that the variable
        # gets set in the top-most context in the context stack. If we didn't
        # do this, the variable would only be available within the specific
        # {% block %} from which the template tag was called, because the
        # {% block %} implementation does a context.push() and context.pop().
        context.dicts[-1][self.context_var] = ni_list

        return ''
Code example #6
 def parse_entry(self, entry, title):
     try:
         item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
     except NewsItem.DoesNotExist:
         item = NewsItem(title=title, schema=self.schema)
     description = convert_entities(entry.description)
     try:
         location, description = description.split(' -- ', 1)
     except ValueError:
         logger.error("Unable to parse description: %s", description)
         return
     item.url = entry.link
     item.description = description
     item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
     try:
         item.location_name = self.geocoder.geocode(location)
     except geocoder.DoesNotExist:
         logger.error("Failed to geocode %s" % location)
         item.location_name = location
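     # pk is None until the first save(), so a missing pk means this item is new.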
     created = item.pk is None
     item.save()
     return created
Code example #7
File: random_news.py Project: DotNetWebs/openblock
def save_random_newsitem(schema, i, block, future=False):
    title = '%d Random %s %s' % (i, schema.name, uuid.uuid4())
    print "Creating %r" % title
    item = NewsItem()
    item.title = title
    item.schema = schema
    item.description = gibberis.ch.freeform.random_text(get_text_corpus(), 300)
    item.url = 'http://example.com/%s/%d' % (schema.slug, i)
    if future:
        date = random_datetime(7.0)
    else:
        date = random_datetime(-7.0)
    item.pub_date = date
    item.item_date = date.date()
    item.location_name = block.pretty_name
    try:
        item.location = block.geom.centroid
    except AttributeError:
        item.location = block.geom
    # Populate the attributes.

    attrs = {}
    for schemafield in schema.schemafield_set.all():
        attrs[schemafield.name] = random_schemafield_value(schemafield)

    print "Added: %s on %s at %s (%s)" % (item.title, item.item_date, item.location_name, item.location.wkt)

    # Need to save before we can have foreign keys from the attributes
    # or subclass.
    item.save()
    if attrs:
        item.attributes = attrs
        # That implicitly saves in the old model, but not the new.
        item.save()
Code example #8
File: retrieval.py Project: mesrut/openblock
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error( "Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get('xCal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." % text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            item.block = result['block']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error('uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                                     % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
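
The response.fromcache check at the top of update() only does something if self.http was built with a cache; a minimal sketch, assuming httplib2 (whose Response objects expose the fromcache flag used above; the cache path is illustrative):

import httplib2

# Passing a directory enables httplib2's on-disk HTTP cache.
http = httplib2.Http('/tmp/openblock-http-cache')
response, content = http.request(url)
if response.fromcache:
    pass  # feed unchanged since the last fetch; skip re-parsing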
Code example #9
    # custom field crime type
    crime_name = SchemaField()
    crime_name.schema = crime_report
    crime_name.pretty_name = "Crime Type"
    crime_name.pretty_plural_name = "Crime Types"
    crime_name.real_name = "varchar02"
    crime_name.name = "crime_type"
    crime_name.save()

    # custom field crime code
    crime_code = SchemaField()
    crime_code.schema = crime_report
    crime_code.pretty_name = "Crime Code"
    crime_code.pretty_plural_name = "Crime Codes"
    crime_code.real_name = "int01"
    crime_code.name = "crime_code"
    crime_code.save()

    # create a Crime Report!
    report = NewsItem()
    report.schema = crime_report
    report.title = "Hooligans causing disturbance downtown"
    report.location_name = "123 Fakey St."
    report.item_date = datetime.utcnow()
    report.pub_date = datetime.utcnow()
    report.description = "Blah Blah Blah"
    report.save()
    report.attributes['officer'] = "John Smith"
    report.attributes['crime_type'] = "Disturbing The Peace"
    report.attributes['crime_code'] = 187
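
To read these attributes back, ebpub can filter NewsItems on a SchemaField; a short sketch using the by_attribute helper that the GeoReport and sheriff scrapers later in this collection rely on (it assumes the crime_report schema above has been saved):

code_field = SchemaField.objects.get(schema=crime_report, name='crime_code')
matches = NewsItem.objects.filter(schema=crime_report).by_attribute(code_field, 187)
for ni in matches:
    print "%s (%s)" % (ni.title, ni.attributes['crime_type'])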
Code example #10
File: retrieval.py Project: mesrut/openblock
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")
        
        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'),
                                                ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point((float(ns_get('geo:long')),
                                       float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except:
                logger.error("unexpected error: %s" % sys.exc_info()[1])
                log_exception()
        logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Code example #11
File: georeportv2.py Project: peudadayusuf/openblock
    def _update_service_request(self, sreq):
        service_request_id = self._get_request_field(sreq, 'service_request_id')

        if not service_request_id:
            log.info("Skipping request with no request id (may be in progress)!")
            return


        # pull out the location first, if we can't do this, we don't want it.
        try:
            point = Point(float(sreq.find('long').text), 
                          float(sreq.find('lat').text),
                          srid=4326)
        except: 
            log.debug("Skipping request with invalid location (%s)" % service_request_id)
            return
        if self.bounds is not None:
            if not self.bounds.intersects(point):
                log.debug("Skipping request at %s, outside bounds" % point)
                return
        try:
            ni = NewsItem.objects.filter(schema=self.schema).by_attribute(self.service_request_id_field, 
                                                                          service_request_id).all()[0]
            log.info('updating existing request %s' % service_request_id)
        except IndexError:
            # create the NewsItem
            ni = NewsItem(schema=self.schema)
            log.info('created new service request %s' % service_request_id)

        ni.title = self._get_request_field(sreq, 'service_name')
        ni.description = self._get_request_field(sreq, 'description')
        ni.location = point
        ni.location_name = self._get_request_field(sreq, 'address')
        # try to reverse geocde this point
        if not ni.location_name:
            try:
                block, distance = reverse_geocode(ni.location)
                ni.location_name = block.pretty_name
            except:
                log.debug("Failed to reverse geocode item %s" % service_request_id)

        # try to pull the requested_datetime into pubdate/itemdate
        # default to now.
        try: 
            ni.pub_date = pyrfc3339.parse(sreq.find('requested_datetime').text)
        except:
            ni.pub_date = datetime.datetime.utcnow()
            log.info("Filling in current time for pub_date on item with no requested_datetime (%s)" % service_request_id)
        ni.item_date = datetime.date(ni.pub_date.year, ni.pub_date.month, ni.pub_date.day)

        if self.html_url_template:
            ni.url = self.html_url_template.replace('{id}', service_request_id)
            log.info('Assigning html url "%s" to %s' % (ni.url, service_request_id))

        ni.save()

        ni.attributes['service_request_id'] = service_request_id

        # varchar fields
        for fieldname in ('request_id', 'service_code', 'address_id',
                          'media_url', 'status_notes', 'service_notice'):
            val = self._get_request_field(sreq, fieldname)
            if val != '':
                if len(val) < 4096:
                    ni.attributes[fieldname] = val
                else: 
                    log.info("truncating value for %s (%s)" % (fieldname, val))
                    ni.attributes[fieldname] = val[0:4096]

        # text fields
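        # The trailing comma makes ('service_notice',) a one-element tuple; without it Python iterates the string's characters.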
        for fieldname in ('service_notice',):
            val = self._get_request_field(sreq, fieldname)
            if val != '':
                ni.attributes[fieldname] = val

        
        # datetime fields
        for fieldname in ('expected_datetime', 'requested_datetime'):
            val = self._get_request_field(sreq, fieldname)
            if val == '':
                continue

            # try to parse it
            try:
                ni.attributes[fieldname] = pyrfc3339.parse(val) 
            except ValueError:
                # invalid date, just omit
                log.info('Omitting invalid datetime field %s = %s' % (fieldname, val))
        
        # lookups 
        for fieldname in ('service_name', 'agency_responsible', 'status'):
            val = self._get_request_field(sreq, fieldname)
            if val == '':
                ni.attributes[fieldname] = self._lookup_for(fieldname, 'Unknown')
            else:
                ni.attributes[fieldname] = self._lookup_for(fieldname, val)
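
The _lookup_for helper isn't part of this snippet; a hypothetical reconstruction, modeled on how the police and sheriff scrapers in this collection call Lookup.objects.get_or_create_lookup:

    def _lookup_for(self, fieldname, value):
        # Resolve the SchemaField for this schema, then get or create a
        # Lookup for the value; the lookup id is what lands in ni.attributes.
        field = SchemaField.objects.get(schema=self.schema, name=fieldname)
        lu = Lookup.objects.get_or_create_lookup(field, value, value, "", False)
        return lu.id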
Code example #12
def save_random_newsitem(schema, i, block):
    title = '%d Random %s %s' % (i, schema.name, uuid.uuid4())
    print "Creating %r" % title
    item = NewsItem()
    item.title = title
    item.schema = schema
    item.description = gibberis.ch.freeform.random_text(get_text_corpus(), 300)
    item.url = 'http://example.com/%s/%d' % (schema.slug, i)
    date = random_datetime(7.0)
    item.pub_date = date
    item.item_date = date.date()
    item.location_name = block.pretty_name
    try:
        item.location = block.geom.centroid
    except AttributeError:
        item.location = block.geom
    # Populate the attributes.

    attrs = {}
    for schemafield in schema.schemafield_set.all():
        attrs[schemafield.name] = random_schemafield_value(schemafield)

    print "Added: %s at %s (%s)" % (item.title, item.location_name,
                                    item.location.wkt)

    # Need to save before we can have foreign keys from the attributes
    # or subclass.
    item.save()
    if attrs:
        item.attributes = attrs
        # That implicitly saves in the old model, but not the new.
        item.save()
Code example #13
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")

        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn(
                    "Multiple entries matched title %r, event titles are not unique?"
                    % title)
                continue
            try:
                item.location_name = '%s %s' % (
                    ns_get('xcal:x-calconnect-venue-name'),
                    ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point(
                    (float(ns_get('geo:long')), float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.info(" Reverse-geocoded point to %r" %
                                    block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" %
                                     (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except Exception as e:
                logger.exception("unexpected error: %s" % e)
        logger.info(
            "EventsCalendarScraper finished: %d added, %d updated of %s total"
            % (addcount, updatecount, seencount))
Code example #14
File: new_yelp.py Project: jtalbott22/OpenCampusKent
import logging
import pytz
import sys, datetime
import dateutil.parser
from django.conf import settings

logger = logging.getLogger('eb.retrieval.restaurant.reviews')
local_tz = pytz.timezone(settings.TIME_ZONE)

params = {'deals_filter' : False,
          'bounds' : " 41.0834917675, -81.39382852783203|41.206297513, -81.30878448486328",
          'limit' : '20'}
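# Yelp v2 'bounds' above is "sw_lat,sw_lon|ne_lat,ne_lon"; this box roughly covers Kent, OH.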

response = yelpAxes.search(params)


for yelpPost in response['businesses']:
        newsItem = NewsItem()
        newsItem.schema = schema
        newsItem.description = yelpPost['snippet_text']
        newsItem.rating = yelpPost['rating']

        newsItem.url = yelpPost['url']
        newsItem.title = yelpPost['name']
        newsItem.item_date = datetime.datetime.now()
        newsItem.pub_date = datetime.datetime.now()
        newsItem.location_name = 'Kent'
        newsItem.location = Point((float (yelpPost['location']['coordinate']['longitude']),
                        float (yelpPost['location']['coordinate']['latitude'])))
        newsItem.save()


class YelpScraper(object):
Code example #15
def main(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                x, y = point.split(' ')
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))
Code example #16
File: add_news.py Project: egrommet/openblock
def main(argv=None):
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)

    f = feedparser.parse(url)

    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            if 'point' in e:
                x,y = e.point.split(' ')
            elif 'georss_point' in e:
                x,y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            f = cStringIO.StringIO()
            traceback.print_exc(file=f)
            msg = f.getvalue()
            print msg
Code example #17
File: add_events.py Project: vijayaraju/everyblock-1
def main():
    """ Download Calendar RSS feed and update database """

    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'

    parser = OptionParser()
    parser.add_option('-q',
                      '--quiet',
                      action="store_true",
                      dest="quiet",
                      default=False,
                      help="no output")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        return parser.error('script does not take any arguments')

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)

    feed = feedparser.parse(url)

    for entry in feed.entries:
        try:
            item = NewsItem.objects.get(title=entry.title,
                                        description=entry.description)
            status = "Updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "Added"

        try:
            item.schema = schema
            item.title = entry.title
            item.description = entry.description
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point(
                (float(entry['geo_long']), float(entry['geo_lat'])))
            item.save()
            if not options.quiet:
                print "%s: %s" % (status, item.title)
        except ValueError:
            if not options.quiet:
                print "unexpected error:", sys.exc_info()[1]
Code example #18
File: views.py Project: mesrut/openblock
def _create_item(request, schema, form):
    item = NewsItem(schema=schema)

    # common attributes
    for attr in ('title', 'description', 'location_name', 'url'): 
        setattr(item, attr, form.cleaned_data[attr])        
    
    
    # location 
    lon = form.cleaned_data['longitude']
    lat = form.cleaned_data['latitude']
    item.location = geos.Point(lon, lat)


    # maybe specified ...
    if 'item_date' in form.cleaned_data: 
        item.item_date = form.cleaned_data['item_date']
    else:
        item.item_date = datetime.datetime.now().date()
    item.pub_date = datetime.datetime.now()
    item.save()
    
    # 'categories'
    cats = [cat for cat in form.cleaned_data['categories'].split(',') if cat.strip()]
    if len(cats): 
        cat_field = SchemaField.objects.get(schema=schema, name='categories')
        lookups = set()
        for cat in cats:
            code = _category_code(cat)
            nice_name = _category_nice_name(cat)
            lu = Lookup.objects.get_or_create_lookup(cat_field, nice_name, code, "", False)
            lookups.add(lu.id)
        item.attributes['categories'] = ','.join(['%d' % luid for luid in lookups])
    
    # image link
    if form.cleaned_data['image_url']: 
        item.attributes['image_url'] = form.cleaned_data['image_url']
    

    item.save()

    # add a NewsItemCreator association
    # un-lazy the User.
    user = User.objects.get(id=request.user.id)
    creator = NewsItemCreator(news_item=item, user=user)
    creator.save()
    
    return item
Code example #19
def update(url):
    logger.info("Scraping police reports")
    schema_slug = 'police'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type')

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title).strip()
        # The title will be used as the incident type.
        if title in SKIP_TYPES:
            logger.info("Skipping entry of type %s" % title)
            continue
        description = convert_entities(entry.summary)
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.pub_date = datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry.geo_long), float(entry.geo_lat)))
            item.location_name = get_element(entry, 'address')

            # parse call time
            ct = datetime.strptime(get_element(entry, 'calldatetime'),
                                   r"%m/%d/%Y %I:%M:%S %p")
            #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal())
            #ct = ct.astimezone(tzutc())

            item.item_date = ct
            item.save()


            # extra attributes
            try:
                item.attributes['calldatetime'] = ct
            except: 
                pass

            try: 
                item.attributes['innum'] = int(get_element(entry, 'innum'))
            except: 
                pass
                
            for k in ['disp', 'aptlot', 'address']: 
                try: 
                    item.attributes[k] = get_element(entry, k)
                except: 
                    pass

            # create a lookup based on the title, this is the closest thing to 
            # a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id


            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
Code example #20
File: add_events.py Project: DotNetWebs/openblock
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'


    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)

    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title,
                                        schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
            continue
        try:
            item.location_name = entry.get('xcal_x-calconnect-street') or entry.get('x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry['geo_long']),
                                   float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue

            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''

            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("unexpected error: %s" % sys.exc_info()[1])

    logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
Code example #21
    def update(self, searchTerm, searchOffset):
        youtubeAPI = YouTubeAPI()
        numentries = 50  # How many results do we want the API to return
        logger.info("Starting YouTube_Scraper")
        response = youtubeAPI.runQuery(searchTerm, numentries, searchOffset)
        seencount = addcount = updatecount = 0
        if response:
            for entry in response:
                seencount += 1
                # The namespace prefix on each key varies (ns0: .. ns8:),
                # so probe each prefix in turn.
                count = 0
                while count != 9:
                    if 'ns' + str(count) + ':title' in entry:
                        if entry['ns' + str(count) + ':title'] != '':
                            title = entry['ns' + str(count) + ':title']
                            count += 1
                        else:
                            logger.info("Skipping, as title is empty.")
                            count += 1
                    else:
                        count += 1
                try:
                    newsItem = NewsItem.objects.get(title=title, schema__id=self.schema.id)
                    status = "updated"
                except NewsItem.DoesNotExist:
                    newsItem = NewsItem()
                    status = "added"
                except NewsItem.MultipleObjectsReturned:
                    logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                    continue
                try:
                    newsItem.schema = self.schema
                    count = 0
                    while count != 9:
                        if 'ns' + str(count) + ':description' in entry:
                            if entry['ns' + str(count) + ':description'] != '':
                                newsItem.description = entry['ns' + str(count) + ':description']
                                break
                            else:
                                logger.info("Skipping %r as description is empty." % (title))
                                count += 1
                        else:
                            count += 1
                    newsItem.url = entry['ns0:link']
                    count = 0
                    while count != 9:
                        if 'ns' + str(count) + ':title' in entry:
                            if entry['ns' + str(count) + ':title'] != '':
                                newsItem.title = entry['ns' + str(count) + ':title']
                                count += 1
                            else:
                                logger.info("Skipping, as title is empty.")
                                count += 1
                        else:
                            count += 1
                    # newsItem.item_date = datetime.datetime.now()
                    count = 0
                    while count != 9:
                        if 'ns' + str(count) + ':published' in entry:
                            yt_timedate = string.split(entry['ns' + str(count) + ':published'], 'T')
                            break
                        else:
                            count += 1
                    date = yt_timedate[0]
                    time = string.split(yt_timedate[1], 'Z')
                    formatted = date
                    # date + " " + time[0] + "000"  # Used to include timestamps
                    newsItem.pub_date = datetime.datetime.now()
                    newsItem.item_date = formatted.encode("utf-8")
                    _short_title = newsItem.title[:30] + '...'
                    # newsItem.location_name = 'Kent'
                    count = 0
                    while count != 9:
                        if 'ns' + str(count) + ':pos' in entry:
                            long_lat = string.split(entry['ns' + str(count) + ':pos'])
                            break
                        else:
                            count += 1
                    newsItem.location = Point(float(long_lat[1]), float(long_lat[0]))
                    x, y = float(long_lat[0]), float(long_lat[1])
                    if not intersects_metro_bbox(newsItem.location):
                        reversed_loc = Point((float(y), float(x)))
                        if intersects_metro_bbox(reversed_loc):
                            logger.info(
                                "Got points in apparently reverse order, flipping them")
                            newsItem.location = reversed_loc
                        else:
                            logger.info("Skipping %r as %s,%s is out of bounds" % (_short_title, y, x))
                            continue
                    if not newsItem.location_name:
                        # Fall back to reverse-geocoding.
                        from ebpub.geocoder import reverse
                        try:
                            block, distance = reverse.reverse_geocode(newsItem.location)
                            logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                            newsItem.location_name = block.pretty_name
                            newsItem.block = block
                        except reverse.ReverseGeocodeError:
                            logger.info(" Skip, failed to reverse geocode %s for %r" % (newsItem.location.wkt, _short_title))
                            continue

                    attributes_ = {}
                    attributes_['photo_href'] = entry['ns0:thumb']
                    attributes_['videoID'] = entry['ns0:video_id']
                    attributes_['searchTerm'] = searchTerm

                    newsItem.save()
                    newsItem.attributes = attributes_
                    newsItem.save()

                    if status == 'added':
                        addcount += 1
                    else:
                        updatecount += 1
                    logger.info("%s: %s" % (status, newsItem.title))
                except Exception as e:
                    logger.exception("unexpected error: %s" % e)
        logger.info("YouTube_Scraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Code example #23
def update(url):
    schema_slug = 'sheriff'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type')

    try:
        innum_field = SchemaField.objects.get(schema=schema, name='innum')
    except SchemaField.DoesNotExist:
        logger.error("SchemaField innum Does Not Exist for %s" % schema_slug)
        sys.exit(1)

    logger.info("Scraping %s" % schema.name)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        
        innum = int(get_element(entry, 'innum'))
        title = convert_entities(entry.title)
        description = convert_entities(entry.summary)

        try:
            item = NewsItem.objects.filter(schema=schema).by_attribute(innum_field, innum)[0]
            #url=item_url)
            status = 'updated'
        except IndexError:
            item = NewsItem()
            status = 'added'

        try:
            item.title = title
            item.schema = schema
            item.description = description

            try:
                item.location = Point((float(entry.geo_long), float(entry.geo_lat)))
            except:
                logger.info("Skipping item %s with no location information" % innum)
                continue

            item.location_name = get_element(entry, 'address')

            # This feed uses an invalidly formatted pubDate that appears to
            # express the time of the incident (24-hour clock); we use it for
            # the publication time as well.
            ct = datetime.strptime(entry.updated, r"%m/%d/%Y %H:%M:%S")
            #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal())
            #ct = ct.astimezone(tzutc())

            item.item_date = ct
            item.pub_date = ct.date()
            item.save()

            # extra attributes
            item.attributes['innum'] = innum

            for k in ['address']: 
                try:
                    item.attributes[k] = get_element(entry, k)
                except:
                    pass

            # Create a lookup based on the title; this is the closest thing
            # to a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id

            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))

    
    logger.info("Finished scraping %s: %d added, %d updated" % (schema.name, addcount, updatecount))
Code example #24
File: random_news.py Project: christaggart/openblock
def save_random_newsitem(schema, i, block):
    title = '%d Random %s %s' % (i, schema.name, uuid.uuid4())
    print "Creating %r" % title
    # XXX datamodel spike hack - switch to using the subclass instance
    for item_factory in (RestaurantInspection, SeeclickfixIssue):
        if item_factory.schemaslug == schema.slug:
            item = item_factory()
            break
    else:
        item = NewsItem()
    item.title = title
    item.schema = schema
    item.description = gibberis.ch.freeform.random_text(get_text_corpus(), 300)
    item.url = 'http://example.com/%s/%d' % (schema.slug, i)
    date = random_datetime(7.0)
    item.pub_date = date
    item.item_date = date.date()
    item.location_name = block.pretty_name
    item.block = block
    try:
        item.location = block.geom.centroid
    except AttributeError:
        item.location = block.geom
    # Populate the attributes.

    attrs = {}
    for schemafield in schema.schemafield_set.all():
        attrs[schemafield.name] = random_schemafield_value(schemafield)

    print "Added: %s at %s (%s)" % (item.title, item.location_name, item.location.wkt)

    # Need to save before we can have foreign keys from the attributes
    # or subclass.
    item.save()
    if attrs:
        item.attributes = attrs
        # That implicitly saves in the old model, but not the new.
        item.save()
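random_datetime() is a helper defined elsewhere in the project; a plausible minimal implementation, assuming it returns a uniformly random datetime within the last `days` days:

import datetime
import random

def random_datetime(days):
    # Uniformly random datetime between now and `days` days ago.
    offset = datetime.timedelta(days=random.uniform(0.0, days))
    return datetime.datetime.now() - offset

print(random_datetime(7.0))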
Code example #25
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
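The point handling above combines three defenses: probing both the namespaced and bare feedparser keys, reading GeoRSS's latitude-first order, and flipping coordinates that miss the metro bounding box. A self-contained sketch, with a hypothetical bounding box standing in for intersects_metro_bbox():

def in_bbox(x, y):
    # Hypothetical greater-Boston box in (lon, lat) order -- illustrative only.
    return -71.5 <= x <= -70.5 and 42.0 <= y <= 42.7

def point_from_entry(entry):
    # feedparser may or may not keep the XML namespace in the key name.
    point = entry.get('georss_point') or entry.get('point')
    y, x = point.split(' ')   # GeoRSS order is "lat lon": latitude first
    x, y = float(x), float(y)
    if not in_bbox(x, y):
        if in_bbox(y, x):
            x, y = y, x       # feed had them reversed; flip
        else:
            return None       # genuinely out of bounds
    return (x, y)

print(point_from_entry({'georss_point': '42.35 -71.06'}))  # (-71.06, 42.35)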
Code example #26
def update(xmlfile, options):
    logger.info("Scraping University of Missouri police reports")

    if options.days == -1:
        start_date = datetime.date(1970, 1, 1)
    else:
        start_date = datetime.date.today() - datetime.timedelta(days=options.days)

    schema_slug = 'mupd'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    # We use iterparse() to avoid keeping the whole XML tree in memory;
    # this is a pretty big file.
    # See http://effbot.org/zone/element-iterparse.htm
    context = iter(lxml.etree.iterparse(xmlfile, events=('start', 'end')))
    addcount = updatecount = 0
    event, root = context.next()
    for event, elem in context:
        if event == 'end' and elem.tag == 'Table':
            category = cleanup(elem.findtext('Description'))
            lat = cleanup(elem.findtext('Lat'))
            lon = cleanup(elem.findtext('Lon'))
            item_date = cleanup(elem.findtext('CreateDatetime'))
            house_number = cleanup(elem.findtext('HouseNumber'))
            prefix = cleanup(elem.findtext('StreetPrefix'))
            street = cleanup(elem.findtext('StreetName'))
            streettype = cleanup(elem.findtext('StreetType'))
            suffix = cleanup(elem.findtext('StreetSuffix'))
            incident_number = cleanup(elem.findtext('IncidentNumber'))
            # We're done with this <Table> tag; clear the root element
            # that iterparse is building to avoid bloating memory with
            # empty elements.
            root.clear()
        else:
            continue

        if item_date:
            item_date = pyrfc3339.parse(item_date)
            if item_date.date() < start_date:
                logger.debug("Date %s is older than start date, skipping." % item_date)
                continue
        else:
            logger.debug("No parsable date, skipping.")
            continue

        location_parts = [house_number, prefix, street, streettype, suffix]
        location_name = ' '.join([s for s in location_parts if s])
        if location_name:
            title = '%s: %s' % (location_name.title(), category.title())
        else:
            title = category.title()

        try:
            lon, lat = float(lon), float(lat)
            location = Point(lon, lat)
        except ValueError:
            location = None

        if location and not intersects_metro_bbox(location):
            logger.info("SKIP %s (at %s), not within our metro area"
                        % (title, (location.x, location.y)))
            continue

        cat_field = SchemaField.objects.get(schema=schema, name='category')
        cat_lookup = Lookup.objects.get_or_create_lookup(
            cat_field, category, category, "", False)

        attributes = {'incident_number': incident_number,
                      'category': cat_lookup.id}

        incident_number_field = SchemaField.objects.get(schema=schema,
                                                        name='incident_number')
        try:
            item = NewsItem.objects.filter(schema__id=schema.id).by_attribute(incident_number_field, incident_number)[0]
            status = 'updated'
        except IndexError:
            item = NewsItem(pub_date=datetime.datetime.now())
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched incident_number %s" % incident_number)
            continue
        logger.debug("%s %s" % (status, incident_number))
        try:
            item.title = title
            item.schema = schema
            item.item_date = item_date.date()
            item.description = title # We don't have anything more verbose!
            item.location = location
            item.location_name = location_name
            item.save()
            item.attributes = attributes
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
Code example #27
File: add_events.py Project: slinkp/openblock
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)

    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title, schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn(
                "Multiple entries matched title %r, event titles are not unique?"
                % title)
            continue
        try:
            item.location_name = entry.get(
                'xcal_x-calconnect-street') or entry.get(
                    'x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point(
                (float(entry['geo_long']), float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue

            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r" %
                                block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''

            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("unexpected error: %s" % sys.exc_info()[1])

    logger.info("add_events finished: %d added, %d updated" %
                (addcount, updatecount))
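The prefix stripping above (ticket #227) relies on len('event: ') == 7; pulled out as a tiny helper:

def strip_event_prefix(title):
    title = title.strip()
    if title.lower().startswith('event: '):
        title = title[len('event: '):]
    return title

print(strip_event_prefix('Event: Jazz at the Hatch Shell'))  # Jazz at the Hatch Shell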
Code example #28
File: add_news.py Project: vijayaraju/everyblock-1
def main(argv=None):
    url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=boston&scope=bonzai'
    schema = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(0)

    f = feedparser.parse(url)
    geocoder = SmartGeocoder()

    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title,
                                        description=e.description)
        except NewsItem.DoesNotExist:
            item = NewsItem()
            item.schema = schema
            item.title = e.title
            item.description = e.description
            item.url = e.link
            #item.location_name = e['x-calconnect-street']
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])

            try:
                if 'point' in e:
                    x, y = e.point.split(' ')
                else:
                    x, y = e.georss_point.split(' ')
                item.location = Point((float(y), float(x)))
                item.save()
            except:
                pass

            print "Added: %s" % item.title
Code example #29
	def update(self):

		#
		#
		# Download Calendar RSS feed and update database
		#
		#

		logger.info("Starting KSUStudentProgrammingScraper")

		feed = feedparser.parse(self.url)
		seencount = addcount = updatecount = 0
		for entry in feed.entries:

			seencount += 1
			title = convert_entities(entry.title)
			title = foo(title, '', ' (')
			try:
				item = NewsItem.objects.get(title=title,
											schema__id=self.schema.id)
				status = "updated"
			except NewsItem.DoesNotExist:
				item = NewsItem()
				status = "added"
			except NewsItem.MultipleObjectsReturned:
				logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
				continue
			try:

				#
				#
				# The actual rss feed elements are grabbed here
				#
				#

				itm_description = entry.description

				soup = BeautifulSoup(foo(itm_description, "</table><br />", "<br /><br />"))
				locations = soup.findAll(text=True)
				location = locations[0].strip()
				place_grabber = places.place_grabber()
				grab_results = place_grabber(location)
				try:
					place = Place.objects.get(pretty_name=grab_results[0][2])
				except:
					place = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place
				item.location = place.location
				item.location_name = place.pretty_name

				try:
					item.attributes['room'] = locations[1].strip().replace("Room: ","")
				except Exception as e:
					logger.info("Tried saving item.room, error: %s" % e)

				item.schema = self.schema
				item.title = title

				soup = BeautifulSoup(foo(itm_description,"<br /><br />","</td></tr>"))
				item.description = soup.findAll(text=True)
				item.description = item.description[0].strip()

				item.url = entry.link

				start_t = foo(itm_description, "Start Time:</b>&nbsp;</td><td>", "</td>")
				start_t = dateutil.parser.parse(start_t)

				end_t = foo(itm_description, "End Time:</b>&nbsp;</td><td>", "</td>")
				end_t = dateutil.parser.parse(end_t)

				end_dt = foo(itm_description, "End Date:</b>&nbsp;</td><td>", "</td>")
				end_dt = dateutil.parser.parse(end_dt)

				item.item_date = dateutil.parser.parse(entry.category)
				item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

				item.attributes['start-time'] = start_t.time()
				item.attributes['end-time'] = end_t.time()

				item.save()

				if status == 'added':
					addcount += 1
				else:
					updatecount += 1
				logger.info("%s: %s" % (status, item.title))
			except Exception as e:
				logger.exception("unexpected error: %s" % e)
		logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Code example #30
File: add_news.py Project: horshacktest/openblock
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            else:
                # Fall back on geocoding addresses found in the text.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text,
                                     exc_info=True)
                        continue
                    except:
                        logger.exception('uncaught geocoder exception on %r\n' % addr)
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(x), float(y)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("Warning: couldn't save %r. Traceback:" % item.title)

    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
Code example #31
	def update(self):

		#
		#
		# Grab the Twitter feeds and start saving
		#
		#

		logger.info("Starting Twitter Scraper")
		response = self.search_twitter(self.hashtag)

		seencount = addcount = updatecount = 0
		for entry in response['results']:
			seencount += 1
			title = entry['text'].replace('RT ','')
			try:
				item = NewsItem.objects.get(title=title,
											schema__id=self.schema.id)
				# Already stored this tweet; nothing to update, so skip it.
				continue
			except NewsItem.DoesNotExist:
				item = NewsItem()
				status = "added"
			except NewsItem.MultipleObjectsReturned:
				logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
				continue
			try:

				#
				#
				# The actual Twitter return results are grabbed here
				#
				#

				if any(entry['from_user'] in s for s in self.allowed_users):
					#item.location_name = entry['location']
					# print entry['entities']['hashtags']
					# print entry['location']
					item.schema = self.schema
					item.title = title
					item.description = entry['text'].replace('RT ','')
					item.location_name = 'student center'

					try:
						item.url = ("https://twitter.com/#!/%s/status/%s" % (entry['from_user'], entry['id_str']))
					except:
						print "No url"

					rg = re.compile(self.re1, re.IGNORECASE | re.DOTALL)
					m = rg.search(item.description)
					if m:
						mmddyy1 = m.group(1)
					else:
						mmddyy1 = entry['created_at']

					item.item_date = dateutil.parser.parse(mmddyy1)
					item.pub_date = datetime.datetime.now()

					item.save()
					item.attributes = {
						'photo_href': entry['profile_image_url'],
						'author': entry['entities']['user_mentions'][0]['screen_name'],
					}
					item.save()

					if status == 'added':
						addcount += 1
					else:
						updatecount += 1
					logger.info("%s: %s" % (status, item.title))
			except Exception as e:
				logger.exception("unexpected error: %s" % e)
		logger.info("TwitterScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))