def handle(self, *args, **options):
    """Scrape the O2 Academy Brixton listing and record a sold-out show.

    Every table row of the listing page is inspected. A row produces an
    ``Event`` only when all of the following hold:

    * its date cell parses and the show starts within the next 24 hours,
    * today is not a Sunday,
    * the row has an artist link and its status icon reads ``Sold Out``,
    * no event with the same guid has been stored already.

    The guid is the artist name (spaces removed) followed by month and
    year, so a multi-night run by one artist is reported only once.

    Rows that lack an expected cell, or whose date cannot be parsed, are
    skipped so that one odd row does not abort the whole scrape.

    Raises:
        AttributeError: if the page has no ``eventViewList`` container.
            This is left loud on purpose so a site redesign gets noticed.
    """
    print("Scraping")
    # Read the clock once so every row is judged against the same instant.
    now = datetime.now()
    is_sunday = now.weekday() == 6

    url = 'http://www.o2academybrixton.co.uk/?t=list'
    html = urllib.urlopen(url)
    page = BeautifulSoup.BeautifulSoup(html)
    rows = page.find('div', {'class': 'eventViewList'}).findAll('tr')

    for row in rows:
        # date and time
        datetime_td = row.find('td', {'class': 'eventViewListDate'})
        if not datetime_td:
            continue

        # Each part is prefixed with a space; the format string below
        # expects exactly that layout.
        datetime_scraped = ''.join(
            ' ' + div.text for div in datetime_td.findAll('div'))
        try:
            event_datetime = datetime.strptime(
                datetime_scraped, " %d %b '%y-%a %I.%M%p")
        except ValueError:
            print("skipping row with unparseable date %r" % datetime_scraped)
            continue

        # Starts within the next 24 hours, and today is not a Sunday?
        if is_sunday or (event_datetime - now).days != 0:
            continue

        # artist
        artist_td = row.find('td', {'class': 'eventViewListName'})
        if not artist_td:
            continue
        artist_link = artist_td.find('a', {'class': 'main'})
        if not artist_link:
            continue
        artist_name = artist_link.text
        info_link = artist_link['href']

        # guid is artist + month + year, i.e. don't save multiple-night runs
        guid = artist_name.replace(' ', '') + event_datetime.strftime('%m%y')

        # Is it sold out? A row without a links cell or a status icon
        # cannot be, so treat it like any other non-sold-out row.
        links_td = row.find('td', {'class': 'eventViewListLinks'})
        status_div = links_td.find('div', {'class': 'icon'}) if links_td else None
        if not status_div or status_div.text != 'Sold Out':
            continue

        # exists() asks the database a yes/no question instead of loading
        # every matching row just to count them.
        if models.Event.objects.filter(guid=guid).exists():
            continue

        event = models.Event()
        event.message = 'The academy sold out tonight: %s' % artist_name
        event.event_type = models.EventType.objects.get(short_name='academy')
        event.info_link = info_link
        event.guid = guid
        event.occured = event_datetime
        event.address = "Brixton Academy, 211 Stockwell Road, SW9 9SL"
        event.lng = -0.11497
        event.lat = 51.46526
        event.data = {'artist_name': artist_name}
        event.save()
        print("saved %s " % event.message)
def handle(self, *args, **options):
    """Import this year's local food hygiene ratings as events.

    Downloads the ratings feed for the SW9 search and stores one ``Event``
    per establishment that

    * was rated in the current calendar year,
    * has coordinates that ``geo.is_local`` accepts,
    * has a numeric rating value, and
    * has not been stored before (guid = business URL + rating date).

    Establishments without usable coordinates or with a non-numeric rating
    are skipped, so a single such record does not abort the whole import.
    """
    print("Getting JSON")
    now = datetime.now()
    current_year = now.strftime('%Y')

    data = json.load(
        urllib.urlopen(
            'http://ratings.food.gov.uk/enhanced-search/en-GB/%5E/sw9/desc_rating/0/522/%5E/1/1/500/json'
        ))
    if not data or not data['FHRSEstablishment'].get('EstablishmentCollection'):
        return

    venues = data['FHRSEstablishment']['EstablishmentCollection'][
        'EstablishmentDetail']
    for venue in venues:
        # Only ratings issued this year are of interest.
        if current_year not in venue['RatingDate']:
            continue
        rating_date = datetime.strptime(venue['RatingDate'], '%Y-%m-%d')

        try:
            latlng = {
                'lat': float(venue['Geocode']['Latitude']),
                'lng': float(venue['Geocode']['Longitude']),
            }
        except (KeyError, TypeError, ValueError):
            # No usable coordinates, so locality cannot be decided.
            continue
        if not geo.is_local(latlng):
            continue

        try:
            rating_value = int(venue['RatingValue'])
        except (TypeError, ValueError):
            # Non-numeric ratings (the feed can carry statuses such as
            # "Exempt") have no score "out of 5" to report.
            continue

        # has it been added before?
        venue_url = 'http://ratings.food.gov.uk/business/en-GB/%s' % venue[
            'FHRSID']
        guid = venue_url + venue['RatingDate']
        if models.Event.objects.filter(guid=guid).exists():
            continue

        event = models.Event()
        event.message = "%s was inspected for food hygiene. It got %s out of 5." % (
            venue['BusinessName'], venue['RatingValue'])
        event.event_type = models.EventType.objects.get(
            short_name='foodratings')
        event.info_link = venue_url
        event.guid = guid
        event.occured = rating_date
        event.address = "%s, %s %s" % (
            venue['AddressLine1'], venue['AddressLine2'], venue['PostCode'])
        event.lng = latlng['lng']
        event.lat = latlng['lat']
        event.data = {
            'business_name': venue['BusinessName'],
            'business_type': venue['BusinessType'],
            'rating_value': rating_value
        }
        event.save()
        print("saved %s " % event.message)
def handle(self, *args, **options):
    """Store recent edits to Wikipedia articles about nearby places.

    Asks wikilocation for articles around a fixed coordinate, keeps those
    that ``geo.is_local`` accepts, reads each article's revision-history
    feed and saves one ``Event`` per revision that was updated within the
    last few days and is not stored yet. The revision link is the guid.
    """
    centre = ('51.46238', '-0.1145')
    lookup_url = 'http://api.wikilocation.org/articles?lat=%s&lng=%s&limit=25&format=json' % centre
    payload = json.load(urllib.urlopen(lookup_url))

    for article in payload['articles']:
        article_id = article['id']
        article_title = article['title']

        # check if local
        position = {
            'lat': float(article['lat']),
            'lng': float(article['lng'])
        }
        if not geo.is_local(position):
            continue

        # revision history feed of this article
        history_url = 'http://en.wikipedia.org/w/index.php?curid=%s&action=history&feed=atom' % article_id
        history = feedparser.parse(history_url)

        for entry in history.entries:
            updated_at = dateutil.parser.parse(entry['updated'])
            # keep only the text after the last colon of the entry title
            change_title = entry['title'].split(':')[-1]
            message = "The Wikipedia article about %s was edited - %s" % (
                article_title, change_title)

            # check if recent (last few days)
            cutoff = date.today() - timedelta(3)
            if updated_at.date() <= cutoff:
                continue

            revision_link = entry['links'][0]['href']
            already_stored = models.Event.objects.filter(guid=revision_link)
            if len(already_stored) != 0:
                continue

            print("save")
            event = models.Event()
            event.message = message
            event.event_type = models.EventType.objects.get(
                short_name='wikipedia')
            event.info_link = revision_link
            event.guid = revision_link
            event.address = ''
            event.lng = article['lng']
            event.lat = article['lat']
            event.data = article
            event.save()
def handle(self, *args, **options):
    """Scrape the Lambeth current licence applications page into events.

    Every ``<li>`` inside an ``infoBox`` div is treated as one application.
    The link text is the address, the fifth child node of the item holds
    the description, and the link target (the application document) is the
    guid. An ``Event`` is saved for each application that is not stored yet
    and whose postcode location ``geo.is_local`` accepts.
    """
    print("Starting to scrape")
    response = urllib.urlopen(
        'http://www.lambeth.gov.uk/Services/Business/LicencesStreetTrading/AlcoholEntertainmentLateNightRefreshment/CurrentApplications.htm'
    )
    soup = BeautifulSoup.BeautifulSoup(response)

    # flatten: all list items of all info boxes, in document order
    items = [
        item
        for box in soup.findAll('div', {'class': 'infoBox'})
        for item in box.findAll('li')
    ]

    for item in items:
        anchor = item.find('a')
        address = anchor.string

        # Description text up to the "last date for representations" note,
        # with trailing separators trimmed (two passes, as two char sets).
        description = item.contents[4].split(
            'last date for representations')[0]
        details = description.rstrip(', ').rstrip(' - ')

        application_pdf_link = 'http://www.lambeth.gov.uk/' + anchor['href']
        latlng = geo.postcode_latlng(geo.extract_gb_postcode(address))

        stored = models.Event.objects.filter(guid=application_pdf_link)
        if len(stored) != 0:
            continue
        if not geo.is_local(latlng):
            continue

        applicant = address.split(',')[0]
        application_type = details.replace('Application for ', '')
        message = "%s %s " % (applicant, application_type)

        event = models.Event()
        event.message = self.humanize(message)
        event.event_type = models.EventType.objects.get(short_name='licence')
        event.info_link = application_pdf_link
        event.guid = application_pdf_link
        event.address = address
        event.lng = latlng['lng']
        event.lat = latlng['lat']
        event.data = {
            'applicant': applicant,
            'application_type': application_type
        }
        event.save()
def handle(self, *args, **options):
    """Record steam trains listed for Brixton on the beholder.co.uk page.

    Reads the Brixton row of the listing. When its second cell has child
    elements, their texts are taken as the passing times and the first
    child's ``href`` is followed to read the service name. At most one
    ``Event`` is stored per clock hour (the guid is the listing URL plus
    the current date and hour).

    Raises:
        AttributeError, IndexError: if the Brixton row or its second cell
            is missing from the page.
    """
    print("Scraping")
    now = datetime.now()
    url = 'http://www.beholder.co.uk/steam/'
    html = urllib.urlopen(url)
    page = BeautifulSoup.BeautifulSoup(html)
    brixton_row = page.find('tr', {'id': 'beholder_row_BRX'})
    values = brixton_row.findAll('td')[1].findChildren()

    if not values:
        print("NO TRAINS :(")
        return
    print("TRAINS!")

    # Work out the guid first: it needs nothing from the detail page, so an
    # already-saved hour costs no second HTTP request.
    guid = "%s/%s" % (url, now.strftime('%Y-%m-%d:%H'))
    if models.Event.objects.filter(guid=guid).exists():
        return

    # Collect every listed time, each followed by a space. Plain assignment
    # inside a loop would keep only the last one.
    # NOTE(review): findChildren() also returns nested descendants; if the
    # time links wrap inner tags their text would repeat - confirm against
    # the live markup.
    times = ''.join(value.text + ' ' for value in values)

    # try and get details of train
    html = urllib.urlopen(values[0]['href'])
    page = BeautifulSoup.BeautifulSoup(html)
    service_name = page.find('table').find('td').find('font').find(
        'b').text.title()

    event = models.Event()
    event.message = "%s will be steaming through Brixton Station at %s" % (
        service_name, times)
    event.event_type = models.EventType.objects.get(short_name='steamtrain')
    event.info_link = url
    event.guid = guid
    event.address = "32 Brixton Station Road, Brixton, London, SW9 8PE"
    event.lng = -0.11399
    event.lat = 51.46327
    event.data = {'service_name': service_name, 'time': times}
    event.save()
    print("saved %s " % event.message)
def handle(self, *args, **options):
    """Turn planning applications from the openlylocal feed into events.

    For each application that is not stored yet (its URL is the guid) and
    whose coordinates ``geo.is_local`` accepts, an ``Event`` is saved with
    the message "<agent> <action> <building>". The action comes from
    ``self.guess_type`` and the agent from ``self.get_agent``, falling
    back to "Someone" when no agent is returned.
    """
    print("Getting JSON")
    feed = urllib.urlopen(
        'http://openlylocal.com/councils/12/planning_applications.json')
    payload = json.load(feed)

    for application in payload['planning_applications']:
        link = application['url']

        # skip anything we already know about
        known = models.Event.objects.filter(guid=link)
        if len(known) != 0:
            continue

        position = {'lat': application['lat'], 'lng': application['lng']}
        if not geo.is_local(position):
            continue

        address = application['address']
        # everything before " London" is used as the building name
        building_name = address.split(' London')[0]
        action = self.guess_type(application['description'])
        agent = self.get_agent(application)
        if agent is None:
            agent = 'Someone'

        event = models.Event()
        event.message = "%s %s %s" % (agent, action, building_name)
        event.event_type = models.EventType.objects.get(short_name='planning')
        event.info_link = link
        event.guid = link
        event.address = address
        event.lng = application['lng']
        event.lat = application['lat']
        event.data = {
            'agent': agent,
            'description': application['description'],
            'application type': application['application_type']
        }
        event.save()