def cache_hansards():
    """Run ``hans.loadHansard`` on every Hansard whose URL contains 'http'.

    Records are selected with a case-insensitive ``icontains`` lookup on
    ``url``. A failure on one record is printed and does not stop the
    remaining records from being processed.
    """
    with_urls = Hansard.objects.filter(url__icontains='http')
    for hansard in with_urls:
        try:
            # Formatting the record stays inside the try so that a failure
            # while rendering it is reported like any other failure.
            print("Loading %s..." % hansard)
            hans.loadHansard(hansard)
        except Exception as err:
            print("Failure %s" % err)
def hansards_from_calendar(session=None):
    """Load every Hansard linked from a session's House sittings calendar.

    Fetches the chamber-sittings calendar page for the session, walks each
    link inside the calendar ``div``, and passes the link's URL to
    ``hans.loadHansard``. A failure on an individual Hansard is printed and
    does not stop the remaining links from being processed.

    Args:
        session: The session whose calendar is fetched; its
            ``parliamentnum`` and ``sessnum`` are substituted into the
            calendar URL. When falsy, ``Session.objects.current()`` is used.

    Raises:
        Errors raised while fetching or parsing the calendar page itself are
        not caught here and propagate to the caller.
    """
    if not session:
        session = Session.objects.current()

    # Calendar links that must never be loaded. Membership is checked against
    # the link as it appears in the calendar, i.e. before the Mode rewrite.
    SKIP_HANSARDS = frozenset([
        'http://www2.parl.gc.ca/HousePublications/Publication.aspx?Language=E&Mode=2&Parl=36&Ses=2&DocId=2332160',
    ])

    url = 'http://www2.parl.gc.ca/housechamberbusiness/chambersittings.aspx?View=H&Parl=%d&Ses=%d&Language=E&Mode=2' % (session.parliamentnum, session.sessnum)

    # Close the HTTP response once it has been parsed rather than leaking the
    # connection. BeautifulSoup consumes the file-like object in its
    # constructor, so closing immediately afterwards is safe.
    response = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        response.close()

    cal = soup.find('div', id='ctl00_PageContent_calTextCalendar')
    # NOTE(review): soup.find returns None when the calendar div is absent,
    # in which case the next line raises AttributeError. Left unchanged so
    # callers see the same exception as before -- confirm whether a clearer
    # error is wanted.
    for link in cal.findAll('a', href=True):
        hurl = 'http://www2.parl.gc.ca' + link['href']
        if hurl in SKIP_HANSARDS:
            continue
        # The calendar links use Mode=2; the Mode=1 variant of the page is
        # the one handed to the loader.
        hurl = hurl.replace('Mode=2&', 'Mode=1&')
        try:
            hans.loadHansard(url=hurl, session=session)
        except Exception as e:
            print("Failure on %s: %s" % (hurl, e))