Code example #1
def GetBillLinks():
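    # Build the list of (year, link, title) entries for bill committee pages: scrape the
    # previous-sessions index at url_pbc_previous, follow each session's old-style bill
    # links, add two Finance Bill committee pages the index misses, and hand the result
    # to get_committee_attributes().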
    committees = []

    billyears = []
    uin = urllib.urlopen(url_pbc_previous)
    s = uin.read()
    uin.close()
    billyears = re.findall('<a href="([^"]*)"[^>]*>(Session .*?)</a>(?is)', s)

    for billyear in billyears:
        match = re.match("Session (\d\d\d\d)-\d\d(?:\d\d)?", billyear[1])
        if not match:
            raise Exception, "Did not find session dates in %s" % billyear[1]
        year = match.group(1)
        if miscfuncs.IsNotQuiet():
            print "year=", year
        for link, text in get_oldstyle_bill_links(billyear[0]):
            committees.append((year, link, text))

    committees += ((
        '2012',
        'http://services.parliament.uk/bills/2010-12/financeno4/committees/houseofcommonspublicbillcommitteeonthefinancebill201213.html',
        'Finance Bill'
    ), ('2014',
        'http://services.parliament.uk/bills/2013-14/finance/committees/houseofcommonspublicbillcommitteeonthefinancebill201415.html',
        'Finance Bill'))

    return get_committee_attributes(committees)
Code example #2
File: pullgluepages.py  Project: samknight/parlparse
def CmIndexFromNewPage(date, type='commons'):
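        # Fetch the by-date Hansard index page (Commons by default, Lords when type=='lords';
        # a handful of 2012 Lords dates are skipped) and return (section title, URL) pairs for
        # debate, Westminster Hall and written material, ignoring PDFs and repeated links.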
        if type=='lords':
                if date >= mx.DateTime.Date(2012,7,31) and date <= mx.DateTime.Date(2012,9,17): return []
                if date == mx.DateTime.Date(2012,10,1): return []
                urllinkpage = '%s?d=%s&m=%d&y=%d' % (url_bydate_index_lords, date.day, date.month, date.year)
        else:
                urllinkpage = '%s?d=%s&m=%d&y=%d' % (url_bydate_index, date.day, date.month, date.year)
        urlinkpage = urllib.urlopen(urllinkpage)
        srlinkpage = urlinkpage.read()
        urlinkpage.close()

        entries = []

        for link1 in re.findall('<a[^>]*?href="(http://www\.publications\.[^"#]+)(?:#[^"]*)?">([^<]*)</a>(?i)', srlinkpage):
                linkhref = link1[0]
                linktext = link1[1]
                if not re.search('debate|westminster|written(?i)', linktext):
                        continue
                if linkhref.endswith('pdf'):
                        if miscfuncs.IsNotQuiet():
                                print "Skipping PDF: %s" % linktext
                        continue

                uind = re.sub('(?:\s|%20)', '', linkhref)
                typ = re.sub('\s+', ' ', linktext).strip()
                if entries and entries[-1][1] == uind:
                        continue
                entries.append((typ, uind))

        return entries
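
A minimal usage sketch for CmIndexFromNewPage (the variable names and the chosen date are illustrative, and it assumes the rest of pullgluepages.py, including url_bydate_index and miscfuncs, is importable): call it with an mx.DateTime date and iterate over the (section title, URL) pairs it returns, much as PullGluePages does in code example #5.

# Hypothetical driver, not part of pullgluepages.py: list one sitting day's Commons sections.
import mx.DateTime

sittingday = mx.DateTime.Date(2012, 3, 1)
for sectiontitle, sectionurl in CmIndexFromNewPage(sittingday, type='commons'):
        print sectiontitle, sectionurl
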
Code example #3
File: pullgluepages.py  Project: samknight/parlparse
def PullGlueToday(forcescrape):
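	# Scrape the rolling 'Today in the Commons' report into a dated debates file, unless a
	# 'printed' version of the day has already been scraped or the report's prepared
	# datetime is unchanged since the last run.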
	# Fetch 'Today in the Commons' index page
	frontpagedata = fetchTextFromUrl(TodayInTheCommonsIndexPageUrl)
	link01url = re.search("<a href=\"(01\.htm)\">Go to Full Report</a>", frontpagedata).group(1)
	pageurl = urlparse.urljoin(TodayInTheCommonsIndexPageUrl, link01url)

	preparedDateMatch = re.search("<p class=\"prepared\">Prepared: <strong>(\d+:\d+) on (\d+ [a-zA-Z]+ \d+)</strong></p>", frontpagedata)
	preparedDateTime = mx.DateTime.DateTimeFrom(preparedDateMatch.group(1) + " " + preparedDateMatch.group(2))
	spreparedDateTime = "%s" % preparedDateTime  # convert to string (can't find the real way to do it)

	# extract the date from the browse links lower down
	headingDateMatch = re.search('''(?x)<h2>Browse\sReport\sBy\sSection</h2>\s*
										<ul>\s*
										<p\sclass="indextext"\salign=left><a\shref="01.htm\#hddr_1"><b>House\sof\sCommons</b></a></p>\s*
										<p\sclass="indextext"\salign=left><a\shref="01.htm\#hddr_2"><i>([^<]*)</i></a></p>''', frontpagedata)
	headingDateTime = mx.DateTime.DateTimeFrom(headingDateMatch.group(1))
	sdate = headingDateTime.date
	assert sdate <= preparedDateTime.date  # the prepared date must be on or after the date in the heading


	# make files which we will copy into
	lddaymap, pwcmfolder = MakeDayMap("debates", "debates")
	dgflatest, dgflatestdayalpha, dgfnext, dgfnextdayalpha = GetFileDayVersions(sdate, lddaymap, pwcmfolder, "debates")

	# See if we actually want to proceed with scraping, or if there already exists a 'printed' version
	# in which case we avoid replacing it with the 'today' version
	latestScrapedFileMetaData = readPageX(dgflatest)
	if latestScrapedFileMetaData.get('type')=='printed':
		print "'Printed' version of hansard for today has already been scraped. Skipping scrape of 'Today' version"
		return None
	if not forcescrape and latestScrapedFileMetaData.get('prepareddatetime') == spreparedDateTime:
		if miscfuncs.IsNotQuiet():
			print "Prepared datetime", spreparedDateTime, "already done"
		return None

	tempFileHandle = open(tempfilename, "w")
	tempFileHandle.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" prepareddatetime="%s" type="today" />\n' % (TodayInTheCommonsIndexPageUrl, time.strftime('%Y-%m-%d', time.gmtime()), time.strftime('%X', time.gmtime()), spreparedDateTime))

	GlueByToday(tempFileHandle, pageurl)
	tempFileHandle.close()

	comp = CompareScrapedFiles(dgflatest, tempfilename)
	# now commit the file
	if comp == 'DIFFERENT':
		print "writing: ", dgfnext
		os.rename(tempfilename, dgfnext)
		return sdate
	elif comp == 'EXTENSION':
		print "OVER-writing: ", dgflatest
		shutil.copyfile(tempfilename, dgflatest)
		os.remove(tempfilename)
		return sdate
	else:
		assert comp == 'SAME'
		print "download exactly the same: ", dgflatest
		return None
Code example #4
File: pullgluepages.py  Project: samknight/parlparse
def GlueByToday(outputFileHandle, pageurl):
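	# Walk the numbered 'Today' pages starting at pageurl, writing a <page> marker and the
	# scraped body of each to outputFileHandle until there is no next link.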
	pagenumber=1
	while pageurl:
		assert pagenumber==int(re.search('(\d+)\.htm$', pageurl).group(1))
		preparedDateTime, nextLink, body = ScrapeTodayPage(pageurl)

		if miscfuncs.IsNotQuiet():
			print "Processed [%s] which was prepared [%s]" % (pageurl, preparedDateTime)
		now = time.gmtime()
		outputFileHandle.write('<page url="%s" prepareddatetime="%s" />\n' % (pageurl, preparedDateTime) )
		outputFileHandle.write(body)
		outputFileHandle.write('\n')

		if nextLink:
			pageurl = urlparse.urljoin(pageurl, nextLink)
		else:
			pageurl = None
		pagenumber += 1
Code example #5
File: pullgluepages.py  Project: samknight/parlparse
def PullGluePages(options, folder, typ):
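	# Assemble the index records to scrape (directly by date for sittings after the 2010
	# election, otherwise from the index built by createhansardindex), glue each day's pages
	# into a temporary file, and commit it when it differs from the latest scraped version.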
	daymap, scrapedDataOutputPath = MakeDayMap(folder, typ)

	scrape = []

	# Post 2010 election scraping done directly, not via index
	if options.dateto >= '2010-05-18':
		if options.datefrom > '2010-05-18':
			date = mx.DateTime.DateTimeFrom(options.datefrom)
		else:
			date = mx.DateTime.DateTimeFrom('2010-05-18')
		while date.date <= options.dateto and date < mx.DateTime.today():
			for recordType, link in CmIndexFromNewPage(date):
				if recordType == 'Written Statements': recordType = 'Written Ministerial Statements'
				if recordType == 'Debates and Oral Answers': recordType = 'Debates'
				if re.search(typ, recordType, re.I):
					scrape.append(CommonsIndexElement(date.date, recordType, link))
			date += mx.DateTime.DateTimeDelta(1)

	# loop through the index file previously made by createhansardindex
	for commonsIndexRecord in CommonsIndex().res:
		# implement date range
		if not re.search(typ, commonsIndexRecord.recordType, re.I):
			continue
		if commonsIndexRecord.date < options.datefrom or commonsIndexRecord.date > options.dateto:
			continue
		scrape.append(commonsIndexRecord)

	for commonsIndexRecord in scrape:
		latestFilePath, latestFileStem, nextFilePath, nextFileStem = \
			GetFileDayVersions(commonsIndexRecord.date, daymap, scrapedDataOutputPath, typ)

		try:
			# hansard index page
			urlx = commonsIndexRecord.url
			if commonsIndexRecord.recordType == 'Votes and Proceedings' or commonsIndexRecord.recordType == 'questionbook':
				urla = [urlx]
				#FIXME - should we be detecting somehow? I don't think this bit is currently used.
				glue_function = GlueByNext
			else:
				urla, new_type_index = ProcessIndexUrl(urlx, latestFilePath, options.forcescrape)  # this checks the url at start of file
				glue_function = GlueByNextNew if new_type_index else GlueByNext
			if not urla:
				continue

			if miscfuncs.IsNotQuiet():
				print commonsIndexRecord.date, (latestFilePath and 'RE-scraping' or 'scraping'), re.sub(".*?cmhansrd/", "", urlx)

			# now we take out the local pointer and start the gluing
			glue_function(tempfilename, urla, urlx, commonsIndexRecord.date)

		except Exception, e:
			options.anyerrors = True
			if options.quietc:
				print e
				print "\tERROR! %s failed to scrape on %s, quietly moving to next day" % (typ, commonsIndexRecord.date)
				continue
			else:
				raise

		if CompareScrapedFiles(latestFilePath, tempfilename) == "SAME":
			if miscfuncs.IsNotQuiet():
				print "  matched with:", latestFilePath
			continue

		# before we copy over the file from tempfilename to nextFilePath, copy over the patch if there is one.
		ReplicatePatchToNewScrapedVersion(folder, latestFileStem, latestFilePath, nextFilePath, nextFileStem)

		# now commit the file
		os.rename(tempfilename, nextFilePath)

		# make the message
		print commonsIndexRecord.date, (latestFilePath and 'RE-scraped' or 'scraped'), re.sub(".*?cmpages/", "", nextFilePath)
Code example #6
File: pullgluepages.py  Project: samknight/parlparse
def GlueByNext(outputFileName, urla, urlx, sdate):
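	# Glue one day's Hansard pages into outputFileName: start from the index URLs in urla,
	# follow each page's 'next section' link, and split page content on <hr> markers; the
	# per-date patches below work around known faults in the published indexes.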
	fout = open(outputFileName, "w")
	# put out the indexlink for comparison with the hansardindex file
	lt = time.gmtime()
	fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" type="printed" />\n' % \
			(urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # Patches
        if sdate=='2006-05-09' and urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060508/text/60508w0308.htm':
                urla = urla[1:]
        if sdate=='2006-05-10' and urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060509/text/60510w0332.htm':
                urla = urla[1:]
        if urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060524/debtext/60524-0001.htm':
                urla = [urla[0]]
        if sdate=='2006-06-05' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060605/text/60605w0640.htm':
                urla = ['http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060602/text/60602w0601.htm', 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060605/text/60605w0602.htm'] + urla
        if sdate=='2006-06-07' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060607/text/60607w0001.htm':
                urla = urla[0:2] + ['http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060607/text/60607w0003.htm'] + urla[2:]
        if sdate=='2006-06-14' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060614/halltext/60614h0001.htm':
                urla = [urla[0]]
        if sdate=='2006-06-13' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060613/halltext/60613h0001.htm':
                urla = [urla[0]]
        if sdate=='2006-07-17' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060717/text/60717w0001.htm':
                urla = [urla[0]]
        if sdate=='2006-10-30' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm061030/text/61030w0001.htm':
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2006-10-17' or sdate=='2006-10-26' or sdate=='2006-10-11' or sdate=='2006-07-12'):
                urla = [urla[0]]
        if sdate=='2006-11-21' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm061121/debtext/61121-0001.htm':
                urla = urla[0:11] + urla[13:] # Incorrect link in middle of index
        if sdate=='2007-03-28' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070328/halltext/70328h0001.htm':
                urla = [urla[0]]
        if sdate=='2007-04-24' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070424/debtext/70424-0001.htm':
                urla = urla[0:14] + urla[16:]
        if sdate=='2007-05-15' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070515/halltext/70515h0001.htm':
                urla = urla[0:4] + urla[6:]
        if urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060614/halltext/60614h0178.htm':
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2007-10-15' or sdate=='2007-10-23' or sdate=='2007-10-09' or sdate=='2007-02-05' or sdate=='2007-03-26' or \
             sdate=='2007-01-15' or sdate=='2006-11-29' or sdate=='2006-11-22' or sdate=='2007-07-11' or sdate=='2007-07-05'):
                urla = [urla[0]]
        if sdate=='2007-10-01' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm071001/text/71001w0001.htm':
                urla = [urla[0]]
        if sdate=='2007-07-19' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070719/wmstext/70719m0001.htm':
                urla = [urla[0]]
        if sdate=='2008-01-24' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080124/halltext/80124h0001.htm':
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/halltext/\1h0001.htm', urla[0]) and \
            (sdate=='2009-02-12' or sdate=='2009-02-24' or sdate=='2009-06-10'):
                urla = [urla[0]]
        if sdate=='2009-02-12' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm090212/wmstext/90212m0001.htm':
                urla = [urla[0]]
        if sdate=='2010-02-23' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200910/cmhansrd/cm100223/wmstext/100223m0001.htm':
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/text/\1w0001.htm', urla[0]) and \
            (sdate=='2009-02-09' or sdate=='2009-02-25' or sdate=='2009-02-26' or sdate=='2009-02-27' or sdate=='2009-09-01' or sdate=='2009-10-19' or sdate=='2009-06-01' or sdate=='2009-05-05'):
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm0(\d{5})/text/\1w0001.htm', urla[0]) and \
            (sdate=='2008-04-21' or sdate=='2008-03-13' or sdate=='2008-01-28' or sdate=='2008-01-16' or sdate=='2008-01-14' or sdate=='2007-11-28'):
                urla = [urla[0]]
        if sdate=='2008-11-17' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm081117/text/81117w0001.htm':
                urla = urla[0:27] + urla[29:]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2008-06-17' or sdate=='2008-07-07' or sdate=='2008-03-06' or sdate=='2008-01-14' or sdate=='2008-06-30' or sdate=='2008-11-20'):
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2009-03-24' or sdate=='2009-06-30' or sdate=='2009-10-19' or sdate=='2009-07-20'):
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm200910/cmhansrd/cm(\d{6})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2010-04-08'):
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/debtext/\1-0001.htm', urla[0]) and \
            sdate in ('2010-09-06', '2011-01-25'):
                urla = [urla[0]]
        if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/wmsindx/\1-x.htm', urlx) and \
            sdate in ('2010-06-14', '2010-09-07', '2010-09-08', '2010-09-09', '2010-09-13', '2010-09-14', '2010-09-15', '2010-09-16', '2010-10-11', '2010-10-12', '2010-10-14'):
		# The first link in here points erroneously to wrans
		urla.pop(0)
        if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/debindx/\1-x.htm', urlx) and \
            sdate=='2010-10-19':
                urla = urla[:4]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/index/100906-x.htm', urlx) and sdate=='2010-09-06':
		if urla[6] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0004.htm':
			urla.pop(6)
		if urla[5] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0013.htm':
			urla.pop(5)
		if urla[0] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0013.htm':
			urla.pop(0)
	if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100713/debindx/100713-x.htm', urlx) and sdate=='2010-07-13':
		urla[1:6] = []

        #print "urla"
        #for aaa in urla:
        #        print aaa
        #sys.exit(1)

	# loop which scrapes through all the pages following the nextlinks
	while urla:
		# import pdb;pdb.set_trace()
		url2 = url = urla[0]
		if sdate=='2009-02-27':
			url2 = re.sub('\s+', '', url2)
		#print " reading " + url
		ur = urllib.urlopen(url2)
		sr = ur.read()
		ur.close()

		# write the marker telling us which page this comes from
		if (url2 != urlx):
			fout.write('<page url="' + url2 + '"/>\n')

                sr = re.sub('<!-- end of variable data -->.*<hr>(?si)', '<hr>', sr)

		# To cope with post 2006-05-08, turn <body> into <hr>
                sr = re.sub('<body><br>', '<body><hr><br>', sr)
                sr = re.sub('<body>\s+<notus', '<body><hr> <notus', sr)
                sr = re.sub('<body><h3 align="center"', '<body><hr><h3 align="center"', sr)
                sr = re.sub('<body><p>', '<body><hr><p>', sr)
                sr = re.sub('<body>\s+<!--<hd>--><br>', '<body><hr><!--<hd>--><br>', sr)
                
                # To cope with post 2006-09; need a better way of doing this!
                sr = re.sub('<div id="maincontent1">\s*<(p|br)>', r'<hr><\1>', sr)
                sr = re.sub('<div id="maincontent1">\s*<h3', '<hr><h3', sr)
                sr = re.sub('<div id="maincontent1">\s*<!--<hd>-->', '<hr>', sr)
                sr = re.sub('<div id="maincontent1">\s*<(notus|meta|a)', r'<hr> <\1', sr) # 2006-05-09 / 2006-10-20
                sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<(br|p|h3|notus|meta|a)', r'<hr><\1', sr) # 2008-06-17 / 2008-10...
                if sdate=='2006-11-07' or sdate=='2006-11-08':
                        sr = re.sub('<!--end of UK Parliament banner for Publications -->\s*<div class="breadcrumb">.*?</div>\s*<h2(?s)', '<hr> <h2', sr)
                sr = re.sub("</?mekon[^>]*>", "", sr)
                sr = re.sub("</?vbcrlf>", "", sr)

                # To cope with post 2011-03
                sr = re.sub('<div id="content-small">', '<div id="content-small"><hr/>', sr)

                # Make sure correction is before written answer question number - XXX right place?
                sr = re.sub('(\[\d+\])\s*((?:</p>)?)\s*(<a href="[^"]*corrtext[^"]*">.*?</a>)', r'\3 \1\2', sr)

		# split by sections
                # hrsections = re.split('<hr(?: size=3)>(?i)', sr)
                # hrsections = re.split('<hr(?: size=3)?(?: width="90%" align="left")?/?>(?i)', sr)
                hrsections = re.split('<hr[^>]*>(?i)', sr)
                # import pdb;pdb.set_trace()
                hrsections = [
                        re.sub('^\s*<table\s*width\s*=\s*"90%">\s*<tr>\s*<td>\s*(.*)</td>\s*</tr>\s*</table>\s*$(?s)', r'\1', x)
                        for x in hrsections
                        ]

		# this is the case for debates on 2003-03-13 page 30
		# http://www.publications.parliament.uk/pa/cm200203/cmhansrd/vo030313/debtext/30313-32.htm
		if len(hrsections) == 1:
			if miscfuncs.IsNotQuiet():
				print len(hrsections), 'page missing', url
			fout.write('<UL><UL><UL></UL></UL></UL>\n')
			urla = urla[1:]
			continue

		# Grr, missing footers ALL OVER THE PLACE now
		if len(hrsections) == 2:
			WriteCleanText(fout, hrsections[1], url, sdate)

		# write the body of the text
		for i in range(1,len(hrsections) - 1):
			WriteCleanText(fout, hrsections[i], url, sdate)

		# find the lead on with the footer
		footer = hrsections[-1]

		# the files are sectioned by the <hr> tag into header, body and footer.
		nextsectionlink = re.findall('<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next(?: section)?</(?:a|td)>(?i)', footer)

		if len(nextsectionlink) > 1:
			raise Exception, "More than one Next Section!!!"
		if not nextsectionlink:
			urla = urla[1:]
			if urla and miscfuncs.IsNotQuiet():
				print "Bridging the missing next section link at %s" % url
		else:
			currenturl = url
			url = urlparse.urljoin(url, nextsectionlink[0])
			if len(urla) > 1 and urla[1] == url:
				urla = urla[1:]
			else:
				for uo in urla:
					if uo == url:
						print "previous URLs:\n"
						print string.join(urla, "\n")
						print "\nbad next url:\n"
						print url
						print "\ncurrent url:\n"
						print currenturl
						raise Exception, "Next Section misses out the urla list"
				urla[0] = url

	fout.close()
Code example #7
File: pullgluepages.py  Project: samknight/parlparse
def GlueByNextNew(outputFileName, urla, urlx, sdate):
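	# Variant of GlueByNext for the newer page layout: the same next-section walking, but
	# each page body is isolated using the 'end of page header' comment and the trailing
	# <hr> instead of <hr> splitting.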
	fout = open(outputFileName, "w")
	# put out the indexlink for comparison with the hansardindex file
	lt = time.gmtime()
	fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" type="printed" />\n' % \
			(urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

	if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140224/debindx/140224-x.htm', urlx) and sdate=='2014-02-24':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140319/debindx/140319-x.htm', urlx) and sdate=='2014-03-19':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140512/debindx/140512-x.htm', urlx) and sdate=='2014-05-12':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201415/cmhansrd/cm140911/index/140911-x.htm', urlx) and sdate=='2014-09-11':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201415/cmhansrd/cm150210/debindx/150210-x.htm', urlx) and sdate=='2015-02-10':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201516/cmhansrd/cm151208/debindx/151208-x.htm', urlx) and sdate=='2015-12-08':
		urla = [urla[0]]
	if re.match(r'http://www.publications.parliament.uk/pa/cm201516/cmhansrd/cm151208/hallindx/151208-x.htm', urlx) and sdate=='2015-12-08':
		urla = [urla[0]]

	# loop which scrapes through all the pages following the nextlinks
	while urla:
		url = urla[0]
		ur = urllib.urlopen(url)
		sr = ur.read()
		ur.close()

		# write the marker telling us which page this comes from
		if (url != urlx):
			fout.write('<page url="' + url + '"/>\n')

		# Hopefully this comment is always present now:
		content = re.sub('^.*?<!--end of page header-->(?s)', '', sr)
		# Pages bar first one have a <hr> before the main content table, but first page does not.
		# After line above, first <hr> will be at the end of the main content.
		# import pdb;pdb.set_trace()
		if '<div class="navLinks">' in content:
			content = re.sub('<hr[^>]*>.*(?s)', '', content)
		else:
			content = re.sub('</td>\s*</tr>\s*</table>\s*<hr[^>]*>.*(?s)', '', content)
		WriteCleanText(fout, content, url, sdate)

		nextsectionlink = re.findall('<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next(?: section)?</(?:a|td)>(?i)', sr)
		if len(nextsectionlink) > 1:
			raise Exception, "More than one Next Section!!!"
		if not nextsectionlink:
			urla = urla[1:]
			if urla and miscfuncs.IsNotQuiet():
				print "Bridging the missing next section link at %s" % url
		else:
			currenturl = url
			url = urlparse.urljoin(url, nextsectionlink[0])
			if len(urla) > 1 and urla[1] == url:
				urla = urla[1:]
			else:
				for uo in urla:
					if uo == url:
						print "previous URLs:\n", "\n".join(urla)
						print "\nbad next url:\n", url
						print "\ncurrent url:\n", currenturl
						raise Exception, "Next Section misses out the urla list"
				urla[0] = url
	fout.close()
Code example #8
def FactorChangesWrans(majblocks, scrapeversion):
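    # Compare the freshly parsed written-answers blocks (majblocks) against the previous
    # scrape and emit <gidredirect> records mapping old gids to new ones, reprinting a
    # question block in full only when its text has changed.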

    # we need to break the scrape version
    # we separate out and match the major headings separately
    # (anyway, these aren't really used)

    # and then match the questions

    # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases
    res = re.findall(
        '<gidredirect oldgid="[^"]*" newgid="[^"]*" matchtype="oldwransgid"/>\n',
        scrapeversion)

    # extract major headings and match injectively exactly (till we find a failed example).
    mhchks = re.findall(
        '<major-heading id="([^"]*)"[^>]*>\n\s*([\s\S]*?)\s*?\n</major-heading>',
        scrapeversion)

    majblocknames = [
        "".join(majblock[0].stext).strip() for majblock in majblocks
    ]
    for mhchk in mhchks:
        if mhchk[1] in majblocknames:
            i = majblocknames.index(mhchk[1])
            res.append(
                '<gidredirect oldgid="%s" newgid="%s" matchtype="perfectmatch"/>\n'
                % (mhchk[0], majblocks[i][0].qGID))
            majblocknames[i] = None  # take it out of circulation
        else:
            res.append(
                '<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n'
                % (mhchk[0], majblocks[0][0].qGID))

    # break into question blocks
    # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text
    # the "<publicwhip> tags have been removed, so split to end of document
    qebchks = re.findall(
        '<minor-heading id="([^"]*)"([^>]*)>\n([\s\S]*?)</minor-heading>\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)',
        scrapeversion)

    # make the map from qnums to blocks
    qnummissings = []
    qnummapq = {}
    for majblock in majblocks:
        for qblock in majblock[1]:
            for qnum in qblock.qnums:
                assert qnum not in qnummapq  # failure means this qnum is found twice in the newly parsed file.
                qnummapq[qnum] = qblock
                if re.match("ZZZZerror", qnum):
                    qnummissings.append(qnum)

    # for each block, find the map forward and check if we want to reprint it in full.
    for qebchk in qebchks:
        qqnums = re.findall('<p [^>]*?qnum="([\d\w]+)">', qebchk[3])
        assert qqnums

        # make sure that they all link to the same qnum in the new one
        qblock = None
        for qqnum in qqnums:
            if qblock:
                if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID:
                    print qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID
                    assert qblock.headingqb.qGID == qnummapq[
                        qqnum].headingqb.qGID
            elif qqnum != '0' and qqnum in qnummapq:  # 0 is when there is a missing qnum
                qblock = qnummapq[qqnum]

        # in this case the qnums fail to find the match, so we either drop the block or find
        # the match by closest text similarity
        if not qblock:
            # find the closest match for this block out of this missing qnum blocks on the new page
            # (this will need to account for all blocks if in future the correction is to add in the qnum)
            if qnummissings:
                qmissblocksscore = []
                for qqnum in qnummissings:
                    similarity = MeasureBlockSimilarity(
                        qebchk[3], qnummapq[qqnum])
                    qmissblocksscore.append((similarity, qqnum))
                qmissblockscorebest = max(qmissblocksscore)
                qblock = qnummapq[qmissblockscorebest[1]]
                if miscfuncs.IsNotQuiet():
                    print "Missing qnum; mapping %s to %s with score %f" % (
                        qebchk[0], qblock.headingqb.qGID,
                        qmissblockscorebest[0])
                assert qmissblockscorebest[
                    0] > 0.8  # otherwise it's not really a match and we need to look harder.
                # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them.

        # now have to check matching.
        # convert both to strings and compare.
        essxfq = []  # this forms the string which we will be comparing against.
        qebchkquesids = []  # expect only one of each
        qebchkreplids = []
        for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]):
            mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>',
                           wd)
            if mwd:
                essxfq.append("<%s>" % mwd.group(1))
                assert mwd.group(1) not in ("reply", "ques") or mwd.group(2)
                if mwd.group(1) == "ques":
                    qebchkquesids.append(mwd.group(2))
                elif mwd.group(1) == "reply":
                    qebchkreplids.append(mwd.group(2))

            elif not re.match("<gidredirect", wd):
                essxfq.append(wd)

        if not qblock and not qnummissings:
            res.append(
                '<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n'
                % (qebchk[0], majblocks[0][0].qGID))
            for qebq in qebchkquesids:
                res.append(
                    '<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n'
                    % (qebq, majblocks[0][0].qGID))
            for qebqr in qebchkreplids:
                res.append(
                    '<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n'
                    % (qebqr, majblocks[0][0].qGID))
            # Is the lred current-gidredirects bit needed here too? Don't think so, but not sure
            continue

        # build up the same summary from the question block
        essbkfq = []
        for qblockqr in (qblock.queses, qblock.replies):
            for qb in qblockqr:
                essbkfq.append("<%s>" % qb.typ)
                for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+",
                                     "\n".join(qb.stext)):
                    mwd = re.match("<(p|tr)[^>]*>", wd)
                    if mwd:
                        essbkfq.append("<%s>" % mwd.group(1))
                    elif not re.match("<gidredirect", wd):
                        essbkfq.append(wd)
                essbkfq.append("</%s>" % qb.typ)

        # print the link forwards
        bchanges = (essxfq != essbkfq)
        matchtype = bchanges and "changes" or "perfectmatch"
        if bchanges:
            res.append("\n")
        res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' %
                   (qebchk[0], qblock.headingqb.qGID, matchtype))

        # write the parallel redirects for the question and reply (both mapping to same parts of each)
        # this may be more sophisticated once we see an example of failure
        # ultimately this is a job for paragraph matching

        # sometimes we get more than one question.
        # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother.
        if len(qebchkquesids) != len(qblock.queses):
            print len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID
            assert len(qebchkquesids) == len(qblock.queses)
        for i in range(len(qebchkquesids)):
            res.append(
                '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' %
                (qebchkquesids[i], qblock.queses[i].qGID, matchtype))

        assert len(qebchkreplids) == len(qblock.replies) == 1
        for qebqr in qebchkreplids:
            res.append(
                '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' %
                (qebqr, qblock.replies[0].qGID, matchtype))

        # if changes write out the original, else just the gidmaps
        if bchanges:
            res.append('<minor-heading id="%s"%s>\n' % qebchk[0:2])
            res.append(qebchk[2])
            res.append('</minor-heading>\n')
            res.append(qebchk[3])
            res.append("\n\n")
        else:
            for lred in re.findall("<gidredirect[^>]*>\n", qebchk[3]):
                res.append("\t")
                res.append(lred)

    return res
Code example #9
File: pullgluepages.py  Project: samknight/parlparse
def GlueByNext(fout, urlx, billtitle):
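    # Glue the pages of one standing/public bill committee sitting into fout: pick header
    # and footer markers for the page vintage (with overrides for known odd pages), then
    # follow the green 'continue' button links from page to page.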
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" billtitle="%s"/>\n' % \
      (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt), billtitle))
    url = urlx

    year = int(re.search('cm(\d{4})', urlx).group(1))
    if year >= 2010:
        pageheader = '<div id="content"'
        pagefooter = '<a name="end"/>'
    else:
        pageheader = '<img\s*src="/pa/img/portsgrn.gif"\s*alt="House\s*of\s*Commons\s*portcullis"><BR>'
        # there are various green button gifs, including two which say "continue", but with different filenames
        pagefooter = '<a href\s*=\s*"[^"]*">\s*<img border=0(?: align=top)? src="/pa/img/(?:ctntgrn|conugrn|prevgrn|contgrn).gif"'
    if re.search("/pa/cm200203/cmstand/d/st030401/am/30401s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200102/cmstand/d/st020115/am/20115s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200304/cmstand/c/st040428/pm/40428s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200203/cmstand/c/st030402/30402s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200102/cmstand/g/st020213/am/20213s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm199900/cmstand/f/st000525/00525s10.htm#pm$", urlx):
        pageheader = "<a name=pm>"
        url = re.sub("#pm", "", url)
    if re.search("/pa/cm200910/cmpublic/bribery/100323/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'
    if re.search("/pa/cm200910/cmpublic/cooperativeandcommunity/100303/am",
                 urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/marriagewales/100224/pm", urlx):
        pagefooter = '<a name="end">'
    if re.search('/pa/cm200910/cmpublic/thirdparties/100316/am', urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/gromarket/100330/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'

    # loop which scrapes through all the pages following the nextlinks
    # knocking off the known links as we go in case a "next page" is missing.
    while True:
        if re.search("/pa/cm199798/cmstand/b/st971106/am/71106s04.htm$", url):
            url = re.sub("s04.htm", "s05.htm", url)  # skip over missing page

        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        repagebody = '(?si).*?%s(.*?)%s' % (pageheader, pagefooter)
        mbody = re.match(repagebody, sr)
        if not mbody:
            if re.search("/pa/cm199899/cmstand/e/st990429/am/90429s03.htm$",
                         url):  # continuation does not exist
                break
            if re.search(
                    "/pa/cm199899/cmstand/special/st990420/pm/pt3/90420s12.htm$",
                    url):  # continuation does not exist
                break
            if re.search("/pa/cm200203/cmstand/d/st031016/pm/31016s06.htm$",
                         url):  # continuation does not exist
                break

            print "\n", pageheader, "\n\n", pagefooter, "\n\n"
            print "header", re.search('(?si)' + pageheader, sr)
            print "footer", re.search('(?si)' + pagefooter, sr)
            print url
            print sr[:2000]
            assert False

        miscfuncs.WriteCleanText(fout, mbody.group(1), False)
        # the files are sectioned by the <hr> tag into header, body and footer.
        mnextsectionlink = re.search(
            '(?si)<\s*a\s+href\s*=\s*"?([^"]*?)"?\s*>\s*<img border=0 align=top src="/pa/img/conugrn.gif"',
            sr[mbody.end(1):])
        #print "   nextsectionlink", mnextsectionlink
        if not mnextsectionlink:
            break
        url = urlparse.urljoin(url, mnextsectionlink.group(1))
        if miscfuncs.IsNotQuiet():
            print "  ", re.sub(".*?cmstand/", "", url)

        # second and subsequent pages
        pageheader = '<p align=right>\[<a href="[^"]*">back to previous text</a>\]'

    pass  #endwhile urla
Code example #10
File: pullgluepages.py  Project: samknight/parlparse
def StandingPullGluePages(datefrom, dateto, bforcescrape):
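    # Scrape each standing committee sitting in the index between datefrom and dateto into
    # a new lettered version file, skipping sittings whose latest stored copy is unchanged
    # apart from the scrape-date line.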
    # make the output directory
    if not os.path.isdir(pwstandingpages):
        os.mkdir(pwstandingpages)

    # load the index file previously made by createhansardindex
    cstandingindex = LoadStandingIndex(pwstandingindex)

    # scan through the directory and make a mapping of all the copies for each
    lshortnamemap = {}
    for ldfile in os.listdir(pwstandingpages):
        mnums = re.match("(standing.*?)([a-z]*)\.html$", ldfile)
        if mnums:
            lshortnamemap.setdefault(mnums.group(1), []).append(
                (AlphaStringToOrder(mnums.group(2)), mnums.group(2), ldfile))
        elif os.path.isfile(os.path.join(pwstandingpages, ldfile)):
            print "not recognized file:", ldfile, " in ", pwstandingpages

    # loop through the index of each lord line.
    for dnu in cstandingindex.res:
        # implement date range
        if dnu[2] < datefrom or dnu[2] > dateto:
            continue

        # make the filename
        dgflatestalpha, dgflatest = "", None
        if dnu[0] in lshortnamemap:
            ldgf = max(lshortnamemap[dnu[0]])
            dgflatestalpha = ldgf[1]
            dgflatest = os.path.join(pwstandingpages, ldgf[2])
        dgfnextalpha = NextAlphaString(dgflatestalpha)
        ldgfnext = '%s%s.html' % (dnu[0], dgfnextalpha)
        dgfnext = os.path.join(pwstandingpages, ldgfnext)
        assert not dgflatest or os.path.isfile(dgflatest)
        assert not os.path.isfile(dgfnext), dgfnext
        dgfnextstem = "%s%s" % (dnu[0], dgfnextalpha)
        dgflateststem = "%s%s" % (dnu[0], dgflatestalpha)

        # hansard index page
        urlx = dnu[1]

        # if not force scrape then we may choose to scrape it anyway
        # where the header doesn't match
        if not bforcescrape and dgflatest:
            fpgx = open(dgflatest, "r")
            pgx = fpgx.readline()
            fpgx.close()
            if pgx:
                pgx = re.findall('<pagex url="([^"]*)"[^/]*/>', pgx)
                if pgx:
                    if pgx[0] == urlx:
                        continue

        # make the message
        if miscfuncs.IsNotQuiet():
            print dnu[0], (dgflatest and 'RE-scraping'
                           or 'scraping'), re.sub(".*?cmstand/", "", urlx)
            print dnu[3]

        # now we take out the local pointer and start the gluing
        # we could check that all our links above get cleared.
        dtemp = open(tempfilename, "w")
        GlueByNext(dtemp, urlx, dnu[3])
        dtemp.close()

        # now we have to decide whether it's actually new and should be copied onto dgfnext.
        if dgflatest:  # the removal of \r makes testing sizes unreliable -- : and os.path.getsize(tempfilename) == os.path.getsize(dgflatest):
            # load in as strings and check matching
            fdgflatest = open(dgflatest)
            sdgflatest = fdgflatest.readlines()
            fdgflatest.close()

            fdgfnext = open(tempfilename)
            sdgfnext = fdgfnext.readlines()
            fdgfnext.close()

            # first line contains the scrape date
            if sdgflatest[1:] == sdgfnext[1:]:
                if miscfuncs.IsNotQuiet():
                    print "  matched with:", dgflatest
                continue

        ReplicatePatchToNewScrapedVersion('standing', dgflateststem, dgflatest,
                                          dgfnext, dgfnextstem)
        print dnu[0], (dgflatest and 'RE-scraped'
                       or 'scraped'), re.sub(".*?cmpages[/\\\\]", "", dgfnext)
        os.rename(tempfilename, dgfnext)