import re
import time
import urllib
import urlparse

import mx.DateTime
import BeautifulSoup

import miscfuncs


def GlueByContents(fout, url_contents, regmemdate):
    ur = urllib.urlopen(url_contents)
    sr = ur.read()
    ur.close()

    soup = BeautifulSoup.BeautifulSoup(sr)
    mps = soup.find('a', attrs={'name': 'A'}).parent.findNextSiblings('p')
    for p in mps:
        url = urlparse.urljoin(url_contents, p.a['href'])
        #print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        if ur.code == 404:
            print "failed to fetch %s - skipping" % url
            continue

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # expand self-closing <p/> tags into open/close pairs so the parse keeps them
        sr = re.sub('<p([^>]*)/>', r'<p\1></p>', sr)
        soup_mp = BeautifulSoup.BeautifulSoup(sr)
        page = soup_mp.find('h1').findNextSiblings(lambda t: t.name != 'div')
        page = '\n'.join([str(p) for p in page]) + '\n'
        miscfuncs.WriteCleanText(fout, page)
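# --- illustration only, not part of the original scraper ---
# Every glue function writes a marker line before each fetched page so later
# stages can tell which source page a chunk of text came from.  The helper
# below is hypothetical, added here purely to show that marker format.
def _example_page_marker(url):
    lt = time.gmtime()
    return '<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
        (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt))
# e.g. _example_page_marker('http://example.org/p1.htm') returns something like
# '<page url="http://example.org/p1.htm" scrapedate="2013-03-27" scrapetime="12:00:00"/>\n'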
def GlueByNext(fout, url, regmemdate):
    # loop which scrapes through all the pages following the nextlinks
    starttablewritten = False
    sections = 0
    while 1:
        print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        sections += 1

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # split by sections
        hrsections = re.split(
            '<a name="top"></a>|'
            '<!-- end of variable data -->'
            '(?i)', sr)

        # write the body of the text
        #for i in range(0, len(hrsections)):
        #    print "------"
        #    print hrsections[i]
        text = hrsections[1]
        m = re.search('<TABLE .*?>([\s\S]*)</TABLE>', text)
        if m:
            text = m.group(1)
        m = re.search('<TABLE .*?>([\s\S]*)', text)
        if m:
            text = m.group(1)
        if not starttablewritten and re.search('COLSPAN=4', text):
            text = "<TABLE>\n" + text
            starttablewritten = True
        miscfuncs.WriteCleanText(fout, text)

        # find the lead on with the footer
        footer = hrsections[2]

        nextsectionlink = re.findall(
            '<a href="([^>]*?)"><img border=0\s+align=top src="/pa/img/conu(?:grn|drd).gif" alt="continue"></a>',
            footer)
        if not nextsectionlink:
            break
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")
        url = urlparse.urljoin(url, nextsectionlink[0])

    # if this fails, we evidently never followed a "continue" link
    assert sections > 1
    fout.write('</TABLE>')
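# --- illustration only ---
# A sketch of how the split above sections the old-format pages: one
# '<a name="top"></a>' precedes the body and one '<!-- end of variable data -->'
# follows it, so hrsections comes out as [banner, body, footer]:
#
#     >>> re.split('<a name="top"></a>|<!-- end of variable data -->(?i)',
#     ...          'BANNER<a name="top"></a>BODY<!-- end of variable data -->FOOTER')
#     ['BANNER', 'BODY', 'FOOTER']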
def GlueByNext(fout, urla, urlx, sdate):
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/61130-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/70125-0001.htm':
        urla = urla[2:]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200506/ldhansrd/vo050517/text/50517-02.htm':
        urla.insert(
            0, 'http://www.publications.parliament.uk/pa/ld200506/ldhansrd/vo050517/text/50517-01.htm')
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200405/ldhansrd/vo041123/text/41123-02.htm':
        urla.insert(
            0, 'http://www.publications.parliament.uk/pa/ld200405/ldhansrd/vo041123/text/41123-01.htm')
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200708/ldhansrd/text/80722-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200708/ldhansrd/text/81104-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110119-0001.htm':
        urla = [urla[0]]
    # Missing header/footer, need to be able to find 2nd HTML page
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327-0001.htm':
        urla.insert(
            1, 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327-0002.htm')

    # loop which scrapes through all the pages following the nextlinks,
    # knocking off the known links as we go in case a "next page" is missing.
    while urla:
        url = urla[0]
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        # To cope with post 2006-07-03, turn <body> into <hr>
        sr = re.sub('<body><notus', '<body><hr><notus', sr)
        #sr = re.sub('<body><br>', '<body><hr><br>', sr)
        sr = re.sub('<body><h3 align="center"', '<body><hr><h3 align="center"', sr)
        sr = re.sub('<body><p>', '<body><hr><p>', sr)

        # post 2006-09
        sr = re.sub("</?mekonParaReplace[^>]*>", "", sr)
        sr = re.sub("</?mekonHrefReplace[^>]*>", "", sr)
        sr = re.sub("<meta[^>]*>", "", sr)
        sr = re.sub('<a name="([^"]*)" />', r'<a name="\1"></a>', sr)
        # Should be WriteCleanText like for Commons?
        sr = re.sub('(<a href="[^"]*&)(">.*?)(</a>)(;.*?)([ .,<])', r'\1\4\2\4\3\5', sr)
        sr = re.sub('<div id="maincontent1">\s+<notus', '<hr> <notus', sr)
        sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<notus', '<hr> <notus', sr)
        # New 2008-10...
        sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<h1', '<hr> <h1', sr)
        # New 2011-01...
        sr = re.sub('<div id="maincontent">(?:\s*<table.*?</table>)?(?s)', '', sr)
        if url in (
                'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/71001w0001.htm',
                'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110118-0001.htm',
        ):
            sr = re.sub('Daily Hansard</span></div>', 'Daily Hansard</span></div> <hr>', sr)

        # To deal with missing header/footer on this day. Might need removing if they come back?
        if url == 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/121105-wms0001.htm' \
                or re.match('http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327', url):
            sr = re.sub('<body>', '<body> <hr>', sr)
        # For 2013-02-26, 2013-05-08, so far
        sr = re.sub('<div id="content-small"><!--end', '<div id="content-small"> <hr><!--end', sr)

        # 2012 summer recess hack
        #if re.match('http://www.publications.parliament.uk/pa/ld/ldtoday/writtens/..0[78]2012\.htm$', url):
        #    sr = sr.replace('<div class="hansardContent">', '<hr><a name="column_WA0">').replace('<hr/>', '<hr>')

        # post 2008-03, stupid duplication of <b>s
        sr = re.sub('<b>((?:<a name="[^"]*"></a>)*)<b>', '\\1<b>', sr)
        sr = re.sub('</b><!--[^>]*--></b>', '</b>', sr)

        # split by sections
        hrsections = re.split('<hr[^>]*>(?i)', sr)

        # this is the case for debates on 2003-03-13 page 30
        # http://www.publications.parliament.uk/pa/cm200203/cmhansrd/vo030313/debtext/30313-32.htm
        if len(hrsections) == 1:
            # special case for the Grand committee proceedings on 2011-03-23
            if url == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-gc0001.htm':
                miscfuncs.WriteCleanText(fout, sr, False)
            else:
                #print len(hrsections), 'page missing', url
                #fout.write('<UL><UL><UL></UL></UL></UL>\n')
                print "Bridging the empty page at %s" % url
                urla = urla[1:]
                continue

        # Lords Written Statements on 2006-07-05, for example, sadly
        if len(hrsections) == 2:
            miscfuncs.WriteCleanText(fout, hrsections[1], False)

        # write the body of the text
        for i in range(1, len(hrsections) - 1):
            miscfuncs.WriteCleanText(fout, hrsections[i], False)

        # find the lead on with the footer
        # (the files are sectioned by the <hr> tag into header, body and footer)
        footer = hrsections[-1]

        nextsectionlink = re.findall(
            '<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next section</a>(?i)', footer)
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")

        if not nextsectionlink:
            urla = urla[1:]
            if urla:
                print "Bridging the missing next section link at %s" % url
        else:
            url = urlparse.urljoin(url, nextsectionlink[0])
            # Specific case on 2011-03-23
            if url == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-wms0001.htm':
                url = 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-gc0001.htm'
            # this link is known
            if (len(urla) > 1) and (urla[1] == url):
                urla = urla[1:]
            # unknown link, either there's a gap in the urla's or a mistake.
            else:
                for uo in urla:
                    if uo == url:
                        print '\n'.join(urla)
                        print "\n\n"
                        print url
                        print "\n\n"
                        raise Exception("Next Section misses out the urla list")
                urla[0] = url
    pass #endwhile urla
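# --- illustration only ---
# The Lords pages are sectioned with re.split and a trailing (?i) inline flag,
# which (in Python 2) makes the whole pattern case-insensitive, so both <HR>
# and <hr width="90%"> act as section breaks:
#
#     >>> re.split('<hr[^>]*>(?i)', '<p>head</p><HR><p>body</p><hr width="90%"><p>foot</p>')
#     ['<p>head</p>', '<p>body</p>', '<p>foot</p>']
#
# giving the [header, body..., footer] list the code above indexes into.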
def GlueByNext(fout, url, regmemdate):
    # loop which scrapes through all the pages following the nextlinks
    starttablewritten = False
    matcheddate = False
    if re.search("ldreg", url):
        matcheddate = True
    sections = 0
    while 1:
        #print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        sections += 1

        # check date
        if not matcheddate:
            dateinpage = re.search("current as at\s*<[bB]>(.*)</[bB]>", sr)
            if not dateinpage:
                raise Exception('Date marker not found')
            dateinpage = dateinpage.group(1).replace("&nbsp;", " ")
            dateinpage = mx.DateTime.DateTimeFrom(dateinpage).date
            if dateinpage != regmemdate:
                raise Exception('Date in page is %s, expected %s - update the URL list in regmempullgluepages.py' % (
                    dateinpage, regmemdate))
            matcheddate = True

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # split by sections
        hrsections = re.split(
            '<TABLE border=0 width="90%">|'
            '</TABLE>\s*?<!-- end of variable data -->|'
            '<!-- end of variable data -->\s*</TABLE>'
            '(?i)', sr)

        # write the body of the text
        #for i in range(0, len(hrsections)):
        #    print "------"
        #    print hrsections[i]
        text = hrsections[2]
        m = re.search('<TABLE .*?>([\s\S]*)</TABLE>', text)
        if m:
            text = m.group(1)
        m = re.search('<TABLE .*?>([\s\S]*)', text)
        if m:
            text = m.group(1)
        if not starttablewritten and re.search('COLSPAN=4', text):
            text = "<TABLE>\n" + text
            starttablewritten = True
        miscfuncs.WriteCleanText(fout, text)

        # find the lead on with the footer
        footer = hrsections[3]

        nextsectionlink = re.findall(
            '<A href="([^>]*?)"><IMG border=0\s+align=top src="/pa/img/next(?:grn|drd).gif" ALT="next page"></A>',
            footer)
        if not nextsectionlink:
            break
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")
        url = urlparse.urljoin(url, nextsectionlink[0])

    # if this fails, the scrape stopped before following enough "next page" links
    assert sections > 10
    fout.write('</TABLE>')
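# --- illustration only ---
# The date check above relies on mx.DateTime parsing the free-form date text
# from the page, with .date rendering it as an ISO string, e.g. (assuming a
# standard mx.DateTime install):
#
#     >>> mx.DateTime.DateTimeFrom('4 November 2002').date
#     '2002-11-04'
#
# which is then compared directly against regmemdate.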
def GlueByNext(fout, urlx, billtitle):
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" billtitle="%s"/>\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt), billtitle))

    url = urlx
    year = int(re.search('cm(\d{4})', urlx).group(1))
    if year >= 2010:
        pageheader = '<div id="content"'
        pagefooter = '<a name="end"/>'
    else:
        pageheader = '<img\s*src="/pa/img/portsgrn.gif"\s*alt="House\s*of\s*Commons\s*portcullis"><BR>'
        # there are various green button gifs, including two which say "continue", but with different filenames
        pagefooter = '<a href\s*=\s*"[^"]*">\s*<img border=0(?: align=top)? src="/pa/img/(?:ctntgrn|conugrn|prevgrn|contgrn).gif"'

    if re.search("/pa/cm200203/cmstand/d/st030401/am/30401s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200102/cmstand/d/st020115/am/20115s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200304/cmstand/c/st040428/pm/40428s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200203/cmstand/c/st030402/30402s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200102/cmstand/g/st020213/am/20213s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm199900/cmstand/f/st000525/00525s10.htm#pm$", urlx):
        pageheader = "<a name=pm>"
        url = re.sub("#pm", "", url)
    if re.search("/pa/cm200910/cmpublic/bribery/100323/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'
    if re.search("/pa/cm200910/cmpublic/cooperativeandcommunity/100303/am", urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/marriagewales/100224/pm", urlx):
        pagefooter = '<a name="end">'
    if re.search('/pa/cm200910/cmpublic/thirdparties/100316/am', urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/gromarket/100330/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'

    # loop which scrapes through all the pages following the nextlinks,
    # knocking off the known links as we go in case a "next page" is missing.
    while True:
        if re.search("/pa/cm199798/cmstand/b/st971106/am/71106s04.htm$", url):
            url = re.sub("s04.htm", "s05.htm", url)  # skip over missing page

        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        repagebody = '(?si).*?%s(.*?)%s' % (pageheader, pagefooter)
        mbody = re.match(repagebody, sr)
        if not mbody:
            if re.search("/pa/cm199899/cmstand/e/st990429/am/90429s03.htm$", url):
                # continuation does not exist
                break
            if re.search("/pa/cm199899/cmstand/special/st990420/pm/pt3/90420s12.htm$", url):
                # continuation does not exist
                break
            if re.search("/pa/cm200203/cmstand/d/st031016/pm/31016s06.htm$", url):
                # continuation does not exist
                break
            print "\n", pageheader, "\n\n", pagefooter, "\n\n"
            print "header", re.search('(?si)' + pageheader, sr)
            print "footer", re.search('(?si)' + pagefooter, sr)
            print url
            print sr[:2000]
            assert False
        miscfuncs.WriteCleanText(fout, mbody.group(1), False)

        # the next-link is searched for in the text after the body match
        mnextsectionlink = re.search(
            '(?si)<\s*a\s+href\s*=\s*"?([^"]*?)"?\s*>\s*<img border=0 align=top src="/pa/img/conugrn.gif"',
            sr[mbody.end(1):])
        #print " nextsectionlink", mnextsectionlink
        if not mnextsectionlink:
            break
        url = urlparse.urljoin(url, mnextsectionlink.group(1))
        if miscfuncs.IsNotQuiet():
            print " ", re.sub(".*?cmstand/", "", url)

        # second and subsequent pages
        pageheader = '<p align=right>\[<a href="[^"]*">back to previous text</a>\]'
    pass #endwhile url
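# --- illustration only ---
# Each "next page" hop resolves the relative href against the page it was
# found on with urlparse.urljoin, e.g. for the standing committee pages:
#
#     >>> urlparse.urljoin('http://www.publications.parliament.uk'
#     ...                  '/pa/cm199798/cmstand/b/st971106/am/71106s04.htm',
#     ...                  '71106s05.htm')
#     'http://www.publications.parliament.uk/pa/cm199798/cmstand/b/st971106/am/71106s05.htm'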