def MpTellerList(fsm, vote, stampurl, sdate): res = [ ] for fss in fsm: if fss == '</b>': continue # The end </b> on Tellers for the (Ayes|Noes): if fss == '<b> and</b>': continue # The 'and' now gets a paragraph of its own while fss: # split by lines, but linefeed sometimes missing gftell = re.match('\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$', fss) if not gftell: raise ContextException("no match on teller line", stamp=stampurl, fragment=fss) fssf = gftell.group(1) fssfcons = gftell.group(2) fss = gftell.group(3) if len(res) >= 2: print fsm raise ContextException(' too many tellers ', stamp=stampurl, fragment=fss) # It always is if fssf == 'Mr. Michael Foster': fssfcons = 'Worcester' (mpid, remadename, remadecons) = memberList.matchfullnamecons(fssf.strip(), fssfcons, sdate) #print fssf, " ++> ", remadename.encode("latin-1") if not mpid: raise ContextException("teller name bad match", stamp=stampurl, fragment=fssf) res.append('\t<mpname id="%s" vote="%s" teller="yes">%s</mpname>' % (mpid, vote, FixHTMLEntities(fssf))) return res
def TokenHonFriend(mhonfriend, phrtok):
    """Turn an "hon. Friend" style reference into a ('phrase', attrs) token.

    mhonfriend -- regex match; group(2) is the name as written and group(1)
                  is passed to the lookup as the constituency argument
    phrtok     -- object whose sdate attribute supplies the lookup date

    Returns ('phrase', attrs) where attrs is a latin-1 encoded string of
    class/id/name attributes.  The id is "unknown" when the lookup returns
    no id.
    """
    written_name = mhonfriend.group(2)
    found = memberList.matchfullnamecons(written_name, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons = False)
    found_id, found_name = found[0], found[1]

    if found_id:
        nid = found_id
        checked_name = found_name
    else:
        # comes back as None
        nid = "unknown"
        checked_name = written_name
    assert not re.search("&", checked_name), checked_name

    # The looked-up name is used for the attribute so that it carries no xml
    # entities.
    # NOTE(review): this is also what gets written when nothing matched, so
    # the name attribute is then whatever the lookup put in its second slot.
    # If you put the .encode("latin-1") on the looked-up name alone it doesn't
    # work when there are strange characters, so the whole string is encoded.
    attrs = ' class="honfriend" id="%s" name="%s"' % (nid, found_name)
    return ('phrase', attrs.encode("latin-1"))
def MpTellerList(fsm, vote, stampurl, sdate): res = [] for fss in fsm: if fss == '</b>': continue # The end </b> on Tellers for the (Ayes|Noes): if fss == '<b> and</b>': continue # The 'and' now gets a paragraph of its own while fss: # split by lines, but linefeed sometimes missing gftell = re.match( '\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$', fss) if not gftell: raise ContextException("no match on teller line", stamp=stampurl, fragment=fss) fssf = gftell.group(1) fssfcons = gftell.group(2) fss = gftell.group(3) if len(res) >= 2: print fsm raise ContextException(' too many tellers ', stamp=stampurl, fragment=fss) # It always is if fssf == 'Mr. Michael Foster': fssfcons = 'Worcester' (mpid, remadename, remadecons) = memberList.matchfullnamecons( fssf.strip(), fssfcons, sdate) #print fssf, " ++> ", remadename.encode("latin-1") if not mpid: raise ContextException("teller name bad match", stamp=stampurl, fragment=fssf) res.append( '\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' % (mpid, vote, FixHTMLEntities(fssf))) return res
def TokenHonFriend(mhonfriend, phrtok):
    """Turn an "hon. Friend" style reference into a ('phrase', attrs) token.

    mhonfriend -- regex match; group(2) is the name as written and group(1)
                  is passed to the lookup as the constituency argument
    phrtok     -- object whose sdate attribute supplies the lookup date

    Returns ('phrase', attrs) where attrs is a latin-1 encoded string of
    class/person_id/name attributes.  The person_id is "unknown" when the
    lookup returns no id.
    """
    written_name = mhonfriend.group(2)
    found = memberList.matchfullnamecons(written_name, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons=False)
    found_id, found_name = found[0], found[1]

    if found_id:
        nid = found_id
        checked_name = found_name
    else:
        # comes back as None
        nid = "unknown"
        checked_name = written_name
    assert not re.search("&", checked_name), checked_name

    # The looked-up name is used for the attribute so that it carries no xml
    # entities.
    # NOTE(review): this is also what gets written when nothing matched, so
    # the name attribute is then whatever the lookup put in its second slot.
    # If you put the .encode("latin-1") on the looked-up name alone it doesn't
    # work when there are strange characters, so the whole string is encoded.
    attrs = ' class="honfriend" person_id="%s" name="%s"' % (nid, found_name)
    return ('phrase', attrs.encode("latin-1"))
#! /usr/bin/python # -*- coding: utf-8 -*- import sys from resolvemembernames import memberList from lords.resolvenames import lordsList print memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22") sys.exit(0) print lordsList.GetLordIDfname('Baroness Thatcher', None, '2006-05-01') print lordsList.GetLordIDfname('The Archbishop of York', None, '2006-05-01') print lordsList.GetLordIDfname('The Bishop of Southwell and Nottingham', None, '2006-05-01') print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2006-01-22") print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2006-01-22") print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2004-01-22") print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2004-01-22") print memberList.canonicalcons("Aberdeen North", "2001-01-01") print memberList.canonicalcons("Aberdeen North", "2005-05-06") print memberList.matchdebatename("Solicitor-General", None, "2003-11-21") print memberList.matchdebatename("The Advocate-General for Scotland", None, "2004-07-30") print memberList.getmembersoneelection("uk.org.publicwhip/member/1238") print memberList.getmembersoneelection("uk.org.publicwhip/member/1353") print memberList.getmembersoneelection("uk.org.publicwhip/member/1357") print memberList.matchdebatename("Mr. Mackay", None, "2003-11-21") print memberList.matchdebatename("James Marshall", None, "2003-11-21")
print "name, constituency, email, fax, phone, constituencyfax" for row in csvreader: if row == ["</b>"]: break origname, region, email, fax, phone, constituencyfax, image_file = map(string.strip, row) # ambiguous names cons = None if origname == "Mr Gareth Thomas": cons = "Clwyd West" if origname == "Mr Gareth R. Thomas": cons = "Harrow West" if origname == "Mr Michael Foster": cons = "Hastings and Rye" if origname == "Mr Michael J. Foster": cons = "Worcester" if origname == "Mr Anthony D. Wright": cons = "Great Yarmouth" if origname == "Dr Tony Wright": cons = "Cannock Chase" id, name, cons = memberList.matchfullnamecons(origname, cons, date_today) if id == None: raise Exception("Failed to match '%s'" % origname) row = [name, cons, email, fax, phone, constituencyfax] row = [x.encode("latin-1") for x in row]; csvwriter.writerow(row);
# Copyright (C) 2009 Matthew Somerville # This is free software, and you are welcome to redistribute it under # certain conditions. However, it comes with ABSOLUTELY NO WARRANTY. # For details see the file LICENSE.html in the top level of the source. import datetime import sys import urlparse import re sys.path.append("../pyscraper") from resolvemembernames import memberList today = '2010-04-12' page = open('../rawdata/MPs_standing_down_in_2010').read() print '''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>''' m = re.findall('<li><a href="([^"]*)"[^>]*>([^<]*)</a>', page) for row in m: url, name = row name = name.decode('utf-8') if name in ('Iris Robinson', 'Ashok Kumar', 'David Taylor'): continue id, canonname, canoncons = memberList.matchfullnamecons(name, None, today) pid = memberList.membertoperson(id) print (' <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1') print '</publicwhip>'
print '''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip> ''' ih = open(input, 'r') c = 0 for l in ih: c = c + 1 origname, origcons, personurl, consurl = map(string.strip, l.split("\t")) origname = re.sub("^(.*), (.*)$", '\\2 \\1', origname) # Match the name, and output basic URLs print >>sys.stderr, "Working on %s %s" % (origname, origcons) id, name, cons = memberList.matchfullnamecons(origname, origcons, date) #print >>sys.stderr, "ID %s name %s cons %s" % (id, name, cons) personid = memberList.membertoperson(id) cons = cons.replace("&", "&") print '<personinfo id="%s" guardian_mp_summary="%s" />' % (personid, personurl) url_match = re.search('^http://www.guardian.co.uk/politics/person/(\d+)/(.*)$', personurl) guardian_aristotle_id = url_match.group(1) print '<personinfo id="%s" guardian_aristotle_id="%s" />' % (personid, guardian_aristotle_id) print '<consinfo canonical="%s" guardian_election_results="%s" />' % (cons.encode("latin-1"), consurl) # Majority setsameelection = memberList.getmembersoneelection(id) #print setsameelection # Grab swing from the constituency page
def RunRegmemFilters(fout, text, sdate, sdatever): if sdate >= '2010-09-01': return RunRegmemFilters2010(fout, text, sdate, sdatever) # message for cron so I check I'm using this print "New register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate text = ApplyFixSubstitutions(text, sdate, fixsubs) WriteXMLHeader(fout) fout.write("<publicwhip>\n") text = re.sub('Rt Shaun', 'Shaun', text) # Always get his name wrong text = re.sub('€', '£', text) # Always get some pound signs wrong rows = re.findall("<TR>(.*)</TR>", text) rows = [re.sub(" ", " ", row) for row in rows] rows = [re.sub("<B>|</B>|<BR>|`", "", row) for row in rows] rows = [ re.sub('<span style="background-color: #FFFF00">|</span>', '', row) for row in rows ] rows = [re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows] rows = [re.sub("­", "-", row) for row in rows] rows = [ re.sub('\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row) for row in rows ] rows = [re.sub('\[<A NAME="n\d+">\d+\]', '', row) for row in rows] # Fix incorrect tabling of categories when highlighting is in play rows = [ re.sub('<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>', r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows ] # split into cells within a row rows = [re.findall("<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows] memberset = set() needmemberend = False category = None categoryname = None subcategory = None for row in rows: striprow = re.sub('</?[^>]+>', '', "".join(row)) #print row if striprow.strip() == "": # There is no text on the row, just tags pass elif len(row) == 1 and re.match("(?i)(<i>)? 
+(</i>)?", row[0]): # <TR><TD COLSPAN=4> </TD></TR> pass elif len(row) == 1: # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR> res = re.search("^([^,]*), ([^(]*) \((.*)\)$", row[0]) if not res: print row raise ContextException, "Failed to break up into first/last/cons: %s" % row[ 0] (lastname, firstname, constituency) = res.groups() constituency = constituency.replace(')', '') constituency = constituency.replace('(', '') firstname = memberList.striptitles(firstname)[0] # Register came out after they stood down if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \ or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'): check_date = '2009-06-08' else: check_date = sdate (id, remadename, remadecons) = memberList.matchfullnamecons( firstname + " " + memberList.lowercaselastname(lastname), constituency, check_date) if not id: raise ContextException, "Failed to match name %s %s (%s) date %s" % ( firstname, lastname, constituency, sdate) if category: fout.write('\t</category>\n') if needmemberend: fout.write('</regmem>\n') needmemberend = False fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' % (id, remadename, sdate)).encode("latin-1")) memberset.add(id) needmemberend = True category = None categoryname = None subcategory = None elif len(row) == 2 and row[0] == '' and re.match('Nil\.\.?', row[1]): # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR> fout.write('Nil.\n') elif len(row) == 2 and row[0] != '': # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR> if category: fout.write('\t</category>\n') digits = row[0] category = re.match("\s*(\d\d?)\.$", digits).group(1) categoryname = row[1] subcategory = None fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname)) elif len(row) == 2 and row[0] == '': # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR> if subcategory: 
fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[1]))) else: fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[1])) elif len(row) == 3 and row[0] == '' and row[1] == '': # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR> if subcategory: fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[2]))) else: fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[2])) elif len(row) == 3 and row[0] == '': # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR> if subcategory: fout.write( '\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[1] + ' ' + row[2]))) else: fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[1] + ' ' + row[2])) elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'): # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR> subcategorymatch = re.match("\(([ab])\)$", row[2]) if not subcategorymatch: content = FixHTMLEntities(row[2] + " " + row[3]) if subcategory: fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, content)) else: fout.write('\t\t<item>%s</item>\n' % content) else: subcategory = subcategorymatch.group(1) fout.write('\t\t(%s)\n' % subcategory) fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[3]))) else: print row raise ContextException, "Unknown row type match, length %d" % ( len(row)) if category: fout.write('\t</category>\n') if needmemberend: fout.write('</regmem>\n') needmemberend = False membersetexpect = set( [m['person_id'] for m in memberList.mpslistondate(sdate)]) # check for missing/extra entries missing = membersetexpect.difference(memberset) if len(missing) > 0: print "Missing %d MP entries:\n" % len(missing), missing extra = 
memberset.difference(membersetexpect) if len(extra) > 0: print "Extra %d MP entries:\n" % len(extra), extra fout.write("</publicwhip>\n")
def RunRegmemFilters2010(fout, text, sdate, sdatever): print "2010-? new register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate WriteXMLHeader(fout) fout.write("<publicwhip>\n") memberset = set() text = re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text) t = BeautifulStoneSoup(text) for page in t('page'): title = page.h2.renderContents() if title in ('HAGUE, Rt Hon William (Richmond (Yorks)', 'PEARCE, Teresa (Erith and Thamesmead'): title += ')' res = re.search("^([^,]*), ([^(]*) \((.*)\)\s*$", title) if not res: raise ContextException, "Failed to break up into first/last/cons: %s" % title (lastname, firstname, constituency) = res.groups() firstname = memberList.striptitles(firstname.decode('utf-8'))[0] lastname = lastname.decode('utf-8') if sdate < '2015-06-01': lastname = memberList.lowercaselastname(lastname) constituency = constituency.decode('utf-8') lastname = lastname.replace(u'O\u2019brien', "O'Brien") # Hmm (id, remadename, remadecons) = memberList.matchfullnamecons(firstname + " " + lastname, constituency, sdate) if not id: raise ContextException, "Failed to match name %s %s (%s) date %s\n" % (firstname, lastname, constituency, sdate) fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' % (id, remadename, sdate)).encode("latin-1")) memberset.add(id) category = None categoryname = None subcategory = None record = False for row in page.h2.findNextSiblings(): text = row.renderContents().decode('utf-8').encode('iso-8859-1', 'xmlcharrefreplace') if row.get('class') == 'spacer': if record: fout.write('\t\t</record>\n') record = False continue if not text or re.match('\s*\.\s*$', text): continue if text == '<strong>%s</strong>' % title: continue if re.match('\s*Nil\.?\s*$', text): fout.write('Nil.\n') continue # Since 2015 election, register is all paragraphs, no headings :( if row.name == 'h3' or row.get('class') == 'shd0' or re.match('<strong>\d+\. 
', text): if re.match('\s*$', text): continue m = re.match("(?:\s*<strong>)?\s*(\d\d?)\.\s*(.*)(?:</strong>\s*)?$", text) if m: if record: fout.write('\t\t</record>\n') record = False if category: fout.write('\t</category>\n') category, categoryname = m.groups() subcategory = None categoryname = re.sub('<[^>]*>(?s)', '', categoryname).strip() fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname)) continue if not record: fout.write('\t\t<record>\n') record = True subcategorymatch = re.match("\s*\(([ab])\)\s*(.*)$", text) if subcategorymatch: subcategory = subcategorymatch.group(1) fout.write('\t\t\t(%s)\n' % subcategory) fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, subcategorymatch.group(2))) continue if subcategory: fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, text)) else: fout.write('\t\t\t<item>%s</item>\n' % text) if record: fout.write('\t\t</record>\n') record = False if category: fout.write('\t</category>\n') fout.write('</regmem>\n') membersetexpect = set([m['person_id'] for m in memberList.mpslistondate(sdate)]) # check for missing/extra entries missing = membersetexpect.difference(memberset) if len(missing) > 0: print "Missing %d MP entries:\n" % len(missing), missing extra = memberset.difference(membersetexpect) if len(extra) > 0: print "Extra %d MP entries:\n" % len(extra), extra fout.write("</publicwhip>\n")
if made[0:4] != "2004": continue made_date = mx.DateTime.DateTimeFrom(made).date constituency = constituency.replace('\\', '') mp_name = mp_name.replace('\\', '') if constituency == "South Tomshire": # better keep rosa's membership of parliament secret continue if constituency == "Trumpton": # i didn't know james was religious continue if constituency == "Stefstown": # i didn't know stef was knighted continue try: mp_id, name, cons = memberList.matchfullnamecons( mp_name, constituency, made_date) except Exception, e: print >> sys.stderr, "FaxYourMP name match failed", e else: if not mp_id: print >> sys.stderr, "FaxYourMP name match failed %s, %s" % ( mp_name, constituency) else: id = memberList.membertoperson(mp_id) if vote.lower() == "no": nohash[id] = nohash.get(id, 0) + 1 elif vote.lower() == "yes" or vote.lower() == "yes" + chr(160): yeshash[id] = yeshash.get(id, 0) + 1 elif vote == "": # print >>sys.stderr, "Blank vote" # Ignore for now
def MpList(fsm, vote, stampurl, sdate): # Merge lone listed constituencies onto end of previous line newfsm = [] for fss in fsm: if not fss: continue if reconstnm.match(fss): # print "constnm only %s appending to previous line %s" % (fss, newfsm[-1]) newfsm[-1] += " " + fss else: newfsm.append(fss) res = [ ] pfss = '' multimatches = { } # from tuple to number of matches accounted, and name for fss in newfsm: #print "fss ", fss # break up concattenated lines # Beresford, Sir PaulBlunt, Crispin while re.search('\S', fss): # there was an & in [A-Z] on line below, but it broke up this incorrectly: # Simon, Siôn <i>(B'ham Erdington)</i> regsep = re.search('(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$', fss) regsep2 = re.match('(.*?,.*?) ([A-Z].*?,.*)$', fss) if regsep and not re.search(' Mc$', regsep.group(1)): fssf = regsep.group(1) fss = regsep.group(2) elif regsep2: fssf = regsep2.group(1) fss = regsep2.group(2) else: fssf = fss fss = '' # check alphabetical - but "rh" and so on confound so don't bother #if pfss and (pfss > fssf): # print pfss, fssf # raise Exception, ' out of alphabetical order %s and %s' % (pfss, fssf) #pfss = fssf # flipround the name # Bradley, rh Keith <i>(Withington)</i> # Simon, Sio(r)n <i>(Withington)</i> #print "fssf ", fssf ginp = reflipname.match(fssf) if ginp: #print "grps ", ginp.groups() fnam = '%s %s' % (ginp.group(2), ginp.group(1)) cons = ginp.group(3) # name not being flipped, is firstname lastname else: ginp = renoflipname.match(fssf) if not ginp: raise ContextException("No flipped or non-flipped name match (filterdivision)", stamp=stampurl, fragment=fssf) fnam = ginp.group(1); cons = ginp.group(2); #print "fss ", fssf (mpid, remadename, remadecons) = memberList.matchfullnamecons(fnam, cons, sdate, alwaysmatchcons = False) if not mpid and remadename == "MultipleMatch": assert type(remadecons) == tuple # actually the list of ids i = len(multimatches.setdefault(remadecons, [])) # the index we work with if i >= len(remadecons): print 
"Name", fnam, "used too many times for list", remadecons, "where other instances are", multimatches[remadecons] raise ContextException("Too many instances", stamp=stampurl, fragment=fnam) mpid = remadecons[i] multimatches[remadecons].append(fnam) # appears with multiple matching which is ignorable when both ambiguous people vote on same side of a division #print "For name", fnam, "returning id", mpid, ";", i, " out of ", remadecons elif not mpid and remadename != "MultipleMatch": print "filterdivision.py: no match for", fnam, cons, sdate raise ContextException("No match on name", stamp=stampurl, fragment=fnam) #print fnam, " --> ", remadename.encode("latin-1") res.append('\t<mpname id="%s" vote="%s">%s</mpname>' % (mpid, vote, FixHTMLEntities(fssf))) # now we have to check if the multimatched names were all exhausted for ids in multimatches: if len(multimatches[ids]) != len(ids): print "Insufficient vote matches on name", multimatches[ids], "ids taken to", ids raise ContextException("Not enough vote match on ambiguous name", stamp=stampurl, fragment=multimatches[ids][0]) return res
for line in content: line = line.strip() if not line or re.match('#', line): continue cols = line.split("\t") name = cols[0] m = re.match('(.*?), (.*)$', name) name = '%s %s' % (m.group(2), m.group(1)) money = cols[1:16] money = map(lambda x: re.sub("\xa3", "", x), money) money = map(lambda x: re.sub(",", "", x), money) id = None cons = None if name == 'Mr Michael Foster': cons = 'Worcester' id, name, cons = memberList.matchfullnamecons(name, cons, yeardate) #if not id: # id, name, newcons = memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate) if not id: raise Exception, "Failed to find MP in line %s" % line pid = memberList.membertoperson(id) # print >>sys.stderr, last, first, money if id in expmembers: print >> sys.stderr, "Ignored repeated entry for ", id else: fout.write('<personinfo id="%s" ' % pid) for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]: if i == 0 or i == 1 or i == 2 or i == 3: col = i + 1 elif i == 4: col = '5a'
def FilterWransSpeakers(fout, text, sdate): text = ApplyFixSubstitutions(text, sdate, fixsubs) # Fix things like this, to put bold in. We use bold below to detect names, but # occasionally the reporters miss it out, and we catch such cases here: # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p> # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p> missingbolds = re.findall( '(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text) for p1, p2, p3, p4 in missingbolds: missingbold = "%s%s%s%s" % (p1, p2, p3, p4) bold = "%s<b>%s%s</b>" % (p1, p3, p4) namematches = memberList.fullnametoids(p3, sdate) # Only fix if we found a matching name in the middle (and do it even if ambiguous) if namematches: #print "Fixing missing bold, had name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip()) if not missingbold in text: print "ERROR: missing bold text found, but then vanished when replacing" text = text.replace(missingbold, bold) #else: #print "Plausible missing bold not fixed, as no name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip()) # <B> Mrs. Iris Robinson: </B> lspeakerregexp = '<b>.*?</b>(?:\s*:)?' ltableregexp = '<table[^>]*>[\s\S]*?</table>' # these have bolds, so must be separated out tableregexp = ltableregexp + '(?i)' lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp) # setup for scanning through the file. 
fs = re.split(lregexp, text) # for error messages stampurl = StampUrl(sdate) for i in range(len(fs)): fss = fs[i] fss = stampurl.UpdateStampUrl(fss) # Speakers have new stamps in them if re.match(tableregexp, fss): continue speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss) if not speakerg: continue # we have a string in bold boldnamestring = string.strip(speakerg[0][0]) # trailing text after the colon in the bold speech bit if re.search('\S', speakerg[0][1]): fs[i + 1] = speakerg[0][1] + fs[i + 1] # push the square brackets outside of the boldstring if there is one # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]: sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring) if sqb: boldnamestring = string.strip(sqb[0][0]) fs[i + 1] = sqb[0][1] + fs[i + 1] # get rid of blank bold strings if not re.search('\S', boldnamestring): fs[i] = '' continue # try to pull in the question number if preceding # These signify aborted oral questions, and are normally # useless and at the start of the page. # 27. <B> Mr. Steen: </B> if i > 0: oqnsep = re.findall( '^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1]) if oqnsep: fs[i - 1] = oqnsep[0][0] + oqnsep[0][2] boldnamestring = oqnsep[0][1] + ' ' + boldnamestring # take out the initial digits and a dot which we may have just put in # (although sometimes it would have already been there) robj = re.match(r"(\d*\.? 
)(.*)$", boldnamestring) deci = None if robj: (deci, boldnamestring) = robj.groups() # TODO: do something with deci here (it is the "failed # oral questions" signifier) # see if it is an explicitly bad/ambiguous name which will never match if boldnamestring.find('<broken-name>') >= 0: person_id = 'unknown' boldnamestring = boldnamestring.replace('<broken-name>', '') remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % ( boldnamestring) else: # split bracketed cons out if present brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring) if brakmatch: (name, cons) = brakmatch.groups() else: (name, cons) = (boldnamestring, None) # match the member to a unique identifier (person_id, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons=False) if person_id and remadename: remadename = ' speakername="%s"' % (remadename) if not person_id: if remadename == "MultipleMatch": if boldnamestring == 'Mr. Michael Foster': if remadecons[0] == 'uk.org.publicwhip/person/10209': person_id = remadecons[0] remadename = ' speakername="Michael Foster"' remadecons = 'Worcester' else: person_id = 'unknown' remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08': person_id = 'uk.org.publicwhip/person/10170' remadename = ' speakername="Jim Dobbin"' else: print " No name,const match (%s,%s)" % (name, cons) raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring) # put record in this place fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \ (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring) # scan through everything and output it into the file fout.writelines(fs)
# print i + 1 matcher = '<a\s*href="(/1/shared/mpdb/html/\d+.stm)" title="Profile of the MP for (.*?)(?: \(.*?\))?"><b>\s*([\s\S]*?)\s*</b></a></td>' matches = re.findall(matcher, content) for match in matches: match = map(lambda x: re.sub("&", "&", x), match) match = map(lambda x: re.sub("\s+", " ", x), match) match = map(lambda x: re.sub("\xa0", "", x), match) match = map(lambda x: x.strip(), match) (url, cons, name) = match # Not in aliases file - see comment there (it's to # avoid ambiguity in debates parsing) if cons == 'Great Yarmouth' and name == 'Tony Wright': name = 'Anthony D Wright' id, canonname, canoncons = memberList.matchfullnamecons( name, cons, date_today) if not id: print >> sys.stderr, "Failed to match %s %s %s" % (name, cons, date_today) continue url = urlparse.urljoin(bbc_index_url, url) pid = memberList.membertoperson(id) if pid in bbcmembers: print >> sys.stderr, "Ignored repeated entry for ", pid else: print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url) bbcmembers.add(pid) sys.stdout.flush()
cons = None if file == "thomas_gareth_591.jpg": cons = "Clwyd West" if file == "thomas_gareth_r_592.jpg": cons = "Harrow West" if file == "wright_tony_w_654.jpg": cons = "Cannock Chase" if file == "wright_tony_653.jpg": cons = "Great Yarmouth" last = last.replace("_", " ") fullname = "%s %s" % (first, last) fullname = memberList.fixnamecase(fullname) (id, correctname, correctcons) = memberList.matchfullnamecons(fullname, cons, photodate) id = memberList.membertoperson(id) id = id.replace("uk.org.publicwhip/person/", "") renamemap[file] = "%s.jpg" % id # print file, renamemap[file] assert len( renamemap.keys()) == 659, "got %d keys, not 659" % len(renamemap.keys()) # sys.exit(1) # Do renaming for name, newname in renamemap.iteritems(): assert not os.path.exists(newname), "file %s already exists" % newname
continue lastname = first_col firstname_and_honorific = firstname_from_string(cols[1]) name = '%s %s' % (firstname_and_honorific, lastname) name = name.decode("latin-1", "replace") money = cols[2:28] money = map(lambda x: re.sub("\xa3","", x), money) money = map(lambda x: re.sub(",","", x), money) money = map(lambda x: re.sub(".00$","", x), money) id = None cons = None # other Michael Foster is Michael Jabez Foster if name == 'Mr Michael Foster': cons = 'Worcester' id, found_name, cons = memberList.matchfullnamecons(name, cons, yeardate) if not id: id, found_name, newcons = memberList.matchfullnamecons(name, cons, otheryeardate) if not id: raise Exception, "Failed to find MP in line %s %d" % (line, line_index) pid = memberList.membertoperson(id) # print >>sys.stderr, lastname, firstname_and_honorific, money if id in expmembers: print >>sys.stderr, "Ignored repeated entry for " , id else: fout.write('<personinfo id="%s" ' % pid) expense_cols = ['total_inc_travel', 'total_exc_travel', 'total_travel', '1', '2',
fout.write('''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>\n''') content = csv.reader(open('../rawdata/mpsexpenses200809.txt')) for cols in content: if cols[0] == 'ID': continue # Header #if cols[1] == 'TOTALS': continue # Footer name = cols[0].decode('utf-8') #party = cols[2] #cons = cols[3].decode('utf-8') money = cols[1:] money = map(lambda x: re.sub("\xa3", "", x), money) money = map(lambda x: re.sub(",", "", x), money) id = None cons = None id, found_name, newcons = memberList.matchfullnamecons( name, cons, '2008-05-01') if not id: id, found_name, newcons = memberList.matchfullnamecons( name, cons, '2008-12-01') if not id: raise Exception, "Failed to find MP %s" % name pid = memberList.membertoperson(id) fout.write('<personinfo id="%s" ' % pid) expense_cols = [ '1', '2', '3', '4', 'total_travel', 'stationery', '9', 'comms_allowance' ] total = 0 for i in range(8): col = expense_cols[i] total += float(money[i].strip())
#<td><a href="/wiki/Lyn_Brown" title="Lyn Brown">Lyn Brown</a></td> #<td>Labour</td> matcher = '<tr>\s+<td><a href="/wiki/[^"]+" [^>]*?title="[^"]+">([^<]+)</a>(?:<br />\s+<small>.*?</small>)?\s*</td>\s+(?:<td[^>]*>\s*</td>\s*<td[^>]*><a[^>]*>[^<]*</a>\s*</td>\s*<td[^>]*>\s*</td>\s*)?<td>(?:(?:<span class="sortkey">[^<]*</span>|<span data-sort-value="[^"]*">)<span class="vcard"><span class="fn">)?(?:Dr |Sir |The Rev\. )?<a href="(/wiki/[^"]+)" [^>]*?title="[^"]+"[^>]*>([^<]+)</a>(?:(?:</span>){2,3})?(?: \(.*?\))?\s*</td>|by-election,[^"]+">([^<]+)</a> [^ ]{1,3} <a href="(/wiki/[^"]+)" title="[^"]+">([^<]+)</a>' matches = re.findall(matcher, content) for (cons, url, name, cons2, url2, name2) in matches: id = None if cons2: cons = cons2 name = name2 url = url2 cons = cons.decode('utf-8') cons = cons.replace('&', '&') name = name.decode('utf-8') try: (id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year]) except Exception, e: print >> sys.stderr, e if not id: continue wikimembers[id] = url print '''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>''' k = wikimembers.keys() k.sort() for id in k: url = urlparse.urljoin(wiki_index_url, wikimembers[id]) print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url) print '</publicwhip>'
# Converts triple of (name, constituency, date) into parlparse person id. # Reads lines from standard input, each line having the triple hash-separated. # Outputs the person ids, one per line. import sys import os # Check this out from the ukparse project using Subversion: # svn co https://scm.kforge.net/svn/ukparse/trunk/parlparse os.chdir("../../../../parlparse/pyscraper") sys.path.append(".") import re from resolvemembernames import memberList while 1: sys.stdin.flush() line = sys.stdin.readline() if not line: break line = line.decode("utf-8") name, cons, date_today = line.split("#") id, canonname, canoncons = memberList.matchfullnamecons(name, cons, date_today) if not id: print >>sys.stderr, "failed to match %s (%s) %s" % (name, cons, date_today) person_id = memberList.membertoperson(id) print person_id sys.stdout.flush()
def _WriteRegmemItem(fout, subcategory, content):
    # Write one <item> line, tagged with the current (a)/(b) subcategory if set.
    if subcategory:
        fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, content))
    else:
        fout.write('\t\t<item>%s</item>\n' % content)


def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert the table-based register of members' interests HTML into XML.

    Registers dated 2010-09-01 or later are handed straight to
    RunRegmemFilters2010.  Otherwise every <TR> row of `text` is classified
    by its number of cells and written to `fout` as <regmem>, <category>
    and <item> elements inside a <publicwhip> root.

    fout     -- writable file-like object receiving the XML
    text     -- raw HTML of the register
    sdate    -- date of the register as a 'YYYY-MM-DD' string
    sdatever -- only passed through to RunRegmemFilters2010

    Raises ContextException when a row cannot be parsed, a member cannot
    be matched, or a row has an unrecognised shape.
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    # message for cron so I check I'm using this
    print("New register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('€', '£', text)  # Always get some pound signs wrong

    # Strip presentational markup from each row before splitting it into cells
    rows = re.findall("<TR>(.*)</TR>", text)
    rows = [ re.sub(" ", " ", row) for row in rows ]
    rows = [ re.sub("<B>|</B>|<BR>|`", "", row) for row in rows ]
    rows = [ re.sub('<span style="background-color: #FFFF00">|</span>', '', row) for row in rows ]
    rows = [ re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows ]
    rows = [ re.sub("­", "-", row) for row in rows ]
    rows = [ re.sub('\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row) for row in rows ]
    rows = [ re.sub('\[<A NAME="n\d+">\d+\]', '', row) for row in rows ]

    # Fix incorrect tabling of categories when highlighting is in play
    rows = [ re.sub('<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>', r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows ]

    # split into cells within a row
    rows = [ re.findall("<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows ]

    memberset = set()
    needmemberend = False
    category = None
    categoryname = None
    subcategory = None
    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass
        elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # <TR><TD COLSPAN=4> </TD></TR>
            pass
        elif len(row) == 1:
            # Member heading row:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search("^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print(row)
                raise ContextException("Failed to break up into first/last/cons: %s" % row[0])
            (lastname, firstname, constituency) = res.groups()
            constituency = constituency.replace(')', '')
            constituency = constituency.replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                    or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                check_date = '2009-06-08'
            else:
                check_date = sdate

            (memberid, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname), constituency, check_date)
            if not memberid:
                raise ContextException("Failed to match name %s %s (%s) date %s" % (firstname, lastname, constituency, sdate))

            # Close whatever was still open for the previous member
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
                needmemberend = False
            fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' % (memberid, remadename, sdate)).encode("latin-1"))
            memberset.add(memberid)
            needmemberend = True
            category = None
            categoryname = None
            subcategory = None
        elif len(row) == 2 and row[0] == '' and re.match('Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')
        elif len(row) == 2 and row[0] != '':
            # Category heading row:
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            if category:
                fout.write('\t</category>\n')
            digits = row[0]
            catmatch = re.match("\s*(\d\d?)\.$", digits)
            if not catmatch:
                # Report like every other unparseable row rather than
                # failing with an AttributeError on None.
                print(row)
                raise ContextException("Failed to parse category number: %s" % digits)
            category = catmatch.group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
        elif len(row) == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[1]))
        elif len(row) == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[2]))
        elif len(row) == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[1] + ' ' + row[2]))
        elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match("\(([ab])\)$", row[2])
            if not subcategorymatch:
                _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[2] + " " + row[3]))
            else:
                # A bare (a)/(b) marker starts a new subcategory
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[3])))
        else:
            print(row)
            raise ContextException("Unknown row type match, length %d" % (len(row)))

    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')
        needmemberend = False

    membersetexpect = set([m['person_id'] for m in memberList.mpslistondate(sdate)])

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
#! /usr/bin/python
# -*- coding: utf-8 -*-
# Scratch script for trying name-resolution lookups by hand.

import sys
sys.path.append('lords/')

from resolvemembernames import memberList
from resolvelordsnames import lordsList

# The lookup currently being tried; the script stops right after it.
print(memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22"))
sys.exit(0)

# Nothing below runs while the sys.exit(0) above is in place.

# Lords lookups
print(lordsList.GetLordIDfname('Baroness Thatcher', None, '2006-05-01'))
print(lordsList.GetLordIDfname('The Archbishop of York', None, '2006-05-01'))
print(lordsList.GetLordIDfname('The Bishop of Southwell and Nottingham', None, '2006-05-01'))

# Same constituency under two names, at two dates
print(memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2006-01-22"))
print(memberList.matchfullnamecons("Anne Picking", "East Lothian", "2006-01-22"))
print(memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2004-01-22"))
print(memberList.matchfullnamecons("Anne Picking", "East Lothian", "2004-01-22"))

# Constituency canonicalisation at two dates
print(memberList.canonicalcons("Aberdeen North", "2001-01-01"))
print(memberList.canonicalcons("Aberdeen North", "2005-05-06"))

# Office-holder names as they appear in debates
print(memberList.matchdebatename("Solicitor-General", None, "2003-11-21"))
print(memberList.matchdebatename("The Advocate-General for Scotland", None, "2004-07-30"))

print(memberList.getmembersoneelection("uk.org.publicwhip/member/1238"))
print(memberList.getmembersoneelection("uk.org.publicwhip/member/1353"))
print(memberList.getmembersoneelection("uk.org.publicwhip/member/1357"))

print(memberList.matchdebatename("Mr. Mackay", None, "2003-11-21"))
#<td><a href="/wiki/West_Ham_%28UK_Parliament_constituency%29" title="West Ham (UK Parliament constituency)">West Ham</a></td> #<td><a href="/wiki/Lyn_Brown" title="Lyn Brown">Lyn Brown</a></td> #<td>Labour</td> matcher = '<tr>\s+<td><a href="/wiki/[^"]+" [^>]*?title="[^"]+">([^<]+)</a>(?:<br />\s+<small>.*?</small>)?</td>\s+(?:<td style="[^"]*"></td>\s*<td[^>]*><a[^>]*>[^<]*</a></td>\s*<td style="[^"]*"></td>\s*)?<td>(?:Dr |Sir |The Rev\. )?<a href="(/wiki/[^"]+)" [^>]*?title="[^"]+"[^>]*>([^<]+)</a>(?: \(.*?\))?</td>|by-election,[^"]+">([^<]+)</a> [^ ]{1,3} <a href="(/wiki/[^"]+)" title="[^"]+">([^<]+)</a>'; matches = re.findall(matcher, content) for (cons, url, name, cons2, url2, name2) in matches: id = None if cons2: cons = cons2 name = name2 url = url2 cons = cons.decode('utf-8') cons = cons.replace('&', '&') name = name.decode('utf-8') try: (id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year]) except Exception, e: print >>sys.stderr, e if not id: continue pid = memberList.membertoperson(id) wikimembers[pid] = url print '''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>''' k = wikimembers.keys() k.sort() for id in k: url = urlparse.urljoin(wiki_index_url, wikimembers[id]) print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url) print '</publicwhip>'
(last, first, alienid) = match.groups() cons = None if file == "thomas_gareth_591.jpg": cons = "Clwyd West" if file == "thomas_gareth_r_592.jpg": cons = "Harrow West" if file == "wright_tony_w_654.jpg": cons = "Cannock Chase" if file == "wright_tony_653.jpg": cons = "Great Yarmouth" last = last.replace("_", " ") fullname = "%s %s" % (first, last) fullname = memberList.fixnamecase(fullname) (id, correctname, correctcons) = memberList.matchfullnamecons(fullname, cons, photodate) id = memberList.membertoperson(id) id = id.replace("uk.org.publicwhip/person/", "") renamemap[file] = "%s.jpg" % id # print file, renamemap[file] assert len(renamemap.keys()) == 659, "got %d keys, not 659" % len(renamemap.keys()) # sys.exit(1) # Do renaming for name, newname in renamemap.iteritems(): assert not os.path.exists(newname), "file %s already exists" % newname print name, "=>", newname
def __init__(self, entry):
    """Populate this object from one calendar feed entry.

    entry -- feed entry; its guid, link and namespaced calendar `event`
             child element are read.

    Sets id, deleted, link_calendar, link_external, chamber, event_date,
    time_start, time_end, people and witnesses.  committee_name,
    debate_type, title, witnesses_str and location are only assigned when
    the corresponding text is present in the event.
    """
    event = entry['{http://services.parliament.uk/ns/calendar/feeds}event']
    self.id = event.attrib['id']
    self.deleted = 0
    self.link_calendar = entry.guid
    self.link_external = entry.link
    chamber = event.chamber.text.strip()
    self.chamber = '%s: %s' % (event.house.text.strip(), chamber)
    self.event_date = event.date.text
    self.time_start = getattr(event, 'startTime', None)
    self.time_end = getattr(event, 'endTime', None)

    # The element really is read under the name "comittee"
    committee_text = event.comittee.text
    if committee_text:
        committee_text = committee_text.strip()
        if chamber in ('Select Committee', 'General Committee'):
            self.committee_name = committee_text
        elif committee_text != "Prime Minister's Question Time":
            self.debate_type = committee_text

    self.people = []
    title_text = event.inquiry.text
    if title_text:
        # A trailing " - Name / Name" part may list the people involved
        m = re.search(' - ([^-]*)$', title_text)
        if m:
            person_texts = [x.strip() for x in m.group(1).split('/')]
            for person_text in person_texts:
                person_id, name, cons = memberList.matchfullnamecons(
                    person_text, None, self.event_date)
                if not person_id:
                    try:
                        person_id = lordsList.GetLordIDfname(
                            person_text, None, self.event_date)
                    except Exception:
                        # Best effort: an unresolvable name is simply not
                        # recorded.  Unlike a bare "except:", this lets
                        # KeyboardInterrupt and SystemExit through.
                        pass
                if person_id:
                    self.people.append(
                        int(person_id.replace('uk.org.publicwhip/person/', '')))
            # Only drop the names from the title if every one was resolved
            if len(self.people) == len(person_texts):
                title_text = title_text.replace(' - ' + m.group(1), '')
        self.title = title_text.strip()
    elif committee_text == "Prime Minister's Question Time":
        self.title = committee_text

    self.witnesses = []
    witness_text = event.witnesses.text
    if witness_text == 'This is a private meeting.':
        self.title = witness_text
    elif witness_text:
        def link_mp(match):
            # Replace one "Forename Surname MP" occurrence by a link to that
            # MP, recording the person id; unmatched names are left as text.
            mp = match.group(1)
            member_id, name, cons = memberList.matchfullnamecons(
                mp, None, self.event_date)
            if not member_id:
                return mp
            pid = int(member_id.replace('uk.org.publicwhip/person/', ''))
            self.witnesses.append(pid)
            return '<a href="/mp/?p=%d">%s</a>' % (pid, mp)

        # One pass over the text, so a name that occurs more than once is
        # linked once per occurrence.  Repeated str.replace calls would wrap
        # the already-inserted links a second time.
        self.witnesses_str = re.sub(
            r'\b(\w+ \w+ MP)', link_mp, witness_text.strip())

    location_text = event.location.text
    if location_text:
        self.location = location_text.strip()
continue lastname = first_col firstname_and_honorific = firstname_from_string(cols[1]) name = '%s %s' % (firstname_and_honorific, lastname) name = name.decode("latin-1", "replace") money = cols[2:28] money = map(lambda x: re.sub("\xa3", "", x), money) money = map(lambda x: re.sub(",", "", x), money) money = map(lambda x: re.sub(".00$", "", x), money) id = None cons = None # other Michael Foster is Michael Jabez Foster if name == 'Mr Michael Foster': cons = 'Worcester' id, found_name, cons = memberList.matchfullnamecons(name, cons, yeardate) if not id: id, found_name, newcons = memberList.matchfullnamecons( name, cons, otheryeardate) if not id: raise Exception, "Failed to find MP in line %s %d" % (line, line_index) pid = memberList.membertoperson(id) # print >>sys.stderr, lastname, firstname_and_honorific, money if id in expmembers: print >> sys.stderr, "Ignored repeated entry for ", id else: fout.write('<personinfo id="%s" ' % pid) expense_cols = [ 'total_inc_travel', 'total_exc_travel', 'total_travel', '1', '2', '3', '4', '7', '7a', '8', '9', 'comms_allowance', 'mp_reg_travel_a', 'mp_reg_travel_b', 'mp_reg_travel_c',
def FilterWransSpeakers(fout, text, sdate):
    """Mark up the speakers in a day of written answers.

    Splits `text` on bold spans and tables, resolves each bold speaker name
    through memberList.matchfullnamecons and replaces it with a
    <speaker speakerid=...> element, then writes all pieces to `fout`.

    fout  -- writable file-like object
    text  -- the written answers text to process
    sdate -- date string passed to the fix-ups, the stamp tracker and the
             name matcher

    Raises ContextException when a name cannot be matched and is not one of
    the special cases handled below.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1,p2,p3,p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1,p2,p3,p4)
        # p2 (an empty bold and/or whitespace) is dropped from the replacement
        bold = "%s<b>%s%s</b>" % (p1,p3,p4)
        namematches = memberList.fullnametoids(p3, sdate)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if namematches:
            #print "Fixing missing bold, had name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())
            if not missingbold in text:
                print "ERROR: missing bold text found, but then vanished when replacing"
            text = text.replace(missingbold, bold)
        #else:
            #print "Plausible missing bold not fixed, as no name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>' # these have bolds, so must be separated out
    tableregexp = ltableregexp + '(?i)'
    lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    # The pattern is one capturing group, so the matched tables and bold
    # spans are kept in the list between the surrounding text pieces.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    for i in range(len(fs)):
        fss = fs[i]
        fss = stampurl.UpdateStampUrl(fss) # Speakers have new stamps in them

        # tables are passed through untouched
        if re.match(tableregexp, fss):
            continue
        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = string.strip(speakerg[0][0])

        # trailing text after the colon in the bold speech bit
        # is moved to the front of the following piece
        if re.search('\S', speakerg[0][1]):
            fs[i+1] = speakerg[0][1] + fs[i+1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = string.strip(sqb[0][0])
            fs[i+1] = sqb[0][1] + fs[i+1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall('^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i-1])
            if oqnsep:
                # remove the number from the preceding piece and prefix it to the name
                fs[i-1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
        # TODO: do something with deci here (it is the "failed
        # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (id, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons = False)
            if id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not id:
                # on "MultipleMatch" remadecons is indexed as a sequence of candidate ids
                if remadename == "MultipleMatch":
                    if boldnamestring == 'Mr. Michael Foster':
                        # hard-wired choice of the Worcester member among the candidates
                        if remadecons[1] == 'uk.org.publicwhip/member/1939':
                            id = remadecons[1]
                            remadename = ' speakername="Michael Foster"'
                            remadecons = 'Worcester'
                        elif remadecons[0] == 'uk.org.publicwhip/member/896':
                            id = remadecons[0]
                            remadename = ' speakername="Michael Foster"'
                            remadecons = 'Worcester'
                    else:
                        id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    # hard-wired id for this one name on this one date
                    id = 'uk.org.publicwhip/member/40316'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print " No name,const match (%s,%s)" % (name, cons)
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker speakerid="%s"%s>%s</speaker>\n' % \
            (id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)
for line in content: line = line.strip() if not line or re.match('#', line): continue cols = line.split("\t") name = cols[0] m = re.match('(.*?), (.*)$', name) name = '%s %s' % (m.group(2), m.group(1)) money = cols[1:16] money = map(lambda x: re.sub("\xa3","", x), money) money = map(lambda x: re.sub(",","", x), money) id = None cons = None if name == 'Mr Michael Foster': cons = 'Worcester' id, name, cons = memberList.matchfullnamecons(name, cons, yeardate) #if not id: # id, name, newcons = memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate) if not id: raise Exception, "Failed to find MP in line %s" % line pid = memberList.membertoperson(id) # print >>sys.stderr, last, first, money if id in expmembers: print >>sys.stderr, "Ignored repeated entry for " , id else: fout.write('<personinfo id="%s" ' % pid) for i in [ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 ]: if i==0 or i==1 or i==2 or i==3: col = i + 1 elif i==4: col = '5a' elif i==5: col = '5b'
# if re.search("Not Found(?i)", content): # raise Exception, "Failed to get content in url %s" % test_url # matcher = '<TD ALIGN="LEFT" VALIGN="TOP"><A HREF="(/weblink/html/member.html/.*)/log=\d+/pos=\d+" TARGET="_parent"><font face="arial,helvetica" size=2>(.*)/(.*)</A></TD>\s*<TD ALIGN="LEFT" VALIGN="TOP"><font face="arial,helvetica" size=2>(.*)</TD>' # matches = re.findall(matcher, content) for line in content: cols = line.split("\t") first = cols[0] last = cols[1] cons = cols[2] money = cols[3:] money = map(lambda x: re.sub("\xa3","", x), money) money = map(lambda x: re.sub(",","", x), money) id, name, cons = memberList.matchfullnamecons(first + " " + last, cons, yeardate) if not id: raise Exception, "Failed to find MP %s %s" % (first, last) pid = memberList.membertoperson(id) # print >>sys.stderr, last, first, money if pid in expmembers: print >>sys.stderr, "Ignored repeated entry for " , pid else: fout.write('<personinfo id="%s" ' % pid) for i in [ 0,1,2,3,4,5,6,7,8,9 ]: if (year=='2004'): if (i==7): col = '7a' elif (i==8 or i==9): col = i
file.close() for line in content: cols = line.split("\t") cons = cols[0] money = cols[1:11] first = '' last = '' if (len(cols)>11): last = cols[11] first = cols[12] money = map(lambda x: re.sub("\xa3","", x), money) money = map(lambda x: re.sub(",","", x), money) id = None if first and last: id, name, newcons = memberList.matchfullnamecons(first + ' ' + last, cons, yeardate) if not id: id, name, newcons = memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate) cons = newcons if not id: id, name, cons = memberList.matchcons(cons, yeardate) if not id: raise Exception, "Failed to find MP in line %s" % line pid = memberList.membertoperson(id) # print >>sys.stderr, last, first, money if id in expmembers: print >>sys.stderr, "Ignored repeated entry for " , id else: fout.write('<personinfo id="%s" ' % pid) for i in [ 0,1,2,3,4,5,6,7,8,9 ]: if (i==7):
def MpList(fsm, vote, stampurl, sdate):
    """Turn the name lines of one side of a division into <mpname> elements.

    fsm      -- the lines of the division list
    vote     -- value written into each element's vote attribute
    stampurl -- passed to ContextException for error context
    sdate    -- date string used when matching names

    Returns a list of '\\t<mpname person_id=... vote=...>...</mpname>'
    strings, one per name found.  Raises ContextException when a name
    cannot be parsed or matched, or when an ambiguous name occurs more or
    fewer times than it has candidate ids.
    """
    # A line holding only a constituency belongs to the line before it
    lines = []
    for piece in fsm:
        if not piece:
            continue
        if not reconstnm.match(piece):
            lines.append(piece)
        else:
            lines[-1] += " " + piece

    res = []
    # tuple of candidate ids -> names already handed an id from that tuple
    ambiguous = {}
    for pending in lines:
        # Peel one entry at a time off lines that were run together, e.g.
        # Beresford, Sir PaulBlunt, Crispin
        while re.search('\S', pending):
            # there was an & in [A-Z] in the pattern below, but it broke up this incorrectly:
            # Simon, Siôn <i>(B'ham Erdington)</i>
            glued = re.search('(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$', pending)
            spaced = re.match('(.*?,.*?) ([A-Z].*?,.*)$', pending)
            if glued and not re.search(' Mc$', glued.group(1)):
                entry, pending = glued.group(1), glued.group(2)
            elif spaced:
                entry, pending = spaced.group(1), spaced.group(2)
            else:
                entry, pending = pending, ''

            # Names normally come surname first and need flipping round:
            # Bradley, rh Keith <i>(Withington)</i>
            flipped = reflipname.match(entry)
            if flipped:
                fnam = '%s %s' % (flipped.group(2), flipped.group(1))
                cons = flipped.group(3)
            else:
                # otherwise it has to be firstname lastname already
                plain = renoflipname.match(entry)
                if not plain:
                    raise ContextException(
                        "No flipped or non-flipped name match (division)",
                        stamp=stampurl, fragment=entry)
                fnam = plain.group(1)
                cons = plain.group(2)

            (mpid, remadename, remadecons) = memberList.matchfullnamecons(
                fnam, cons, sdate, alwaysmatchcons=False)
            if not mpid:
                if remadename != "MultipleMatch":
                    print("division.py: no match for %s %s %s" % (fnam, cons, sdate))
                    raise ContextException("No match on name", stamp=stampurl, fragment=fnam)

                # Ambiguous name: hand out the candidate ids in order, one per
                # appearance, which is harmless when all of the ambiguous
                # people vote on the same side of the division.
                assert type(remadecons) == tuple  # actually the list of ids
                taken = ambiguous.setdefault(remadecons, [])
                if len(taken) >= len(remadecons):
                    print("Name %s used too many times for list %s where other instances are %s"
                          % (fnam, remadecons, taken))
                    raise ContextException("Too many instances", stamp=stampurl, fragment=fnam)
                mpid = remadecons[len(taken)]
                taken.append(fnam)

            res.append('\t<mpname person_id="%s" vote="%s">%s</mpname>' %
                       (mpid, vote, FixHTMLEntities(entry)))

    # every candidate id of each ambiguous name must have been used up
    for ids in ambiguous:
        names = ambiguous[ids]
        if len(names) != len(ids):
            print("Insufficient vote matches on name %s ids taken to %s" % (names, ids))
            raise ContextException("Not enough vote match on ambiguous name",
                                   stamp=stampurl, fragment=names[0])

    return res
origcons = origcons.replace("Stretford and ~~~~~~~", "Stretford and Urmston") # no longer in house - TODO give better date if origname == "Dennis Canavan" or origname == "Rt Hon Paul Daisley": continue if origcons == "South Tomshire": # better keep rosa's membership of parliament secret continue if origcons == "Trumpton": # i didn't know james was religious continue if origcons == "Stefstown": # i didn't know stef was knighted continue try: id, name, cons = memberList.matchfullnamecons(origname, origcons, date_today) except Exception, e: print >> sys.stderr, "FaxYourMP name match failed" print >> sys.stderr, e else: if voteside.lower() == "no": nohash[id] = nohash.get(id, 0) + int(votecount) elif voteside.lower() == "yes" or voteside.lower() == "yes" + chr(160): yeshash[id] = yeshash.get(id, 0) + int(votecount) else: raise Exception, "Strange vote %s" % voteside ih.close() def responsiveness(id):
def RunRegmemFilters2010(fout, text, sdate, sdatever):
    """Convert the page-based register of members' interests into XML.

    Each <page> of `text` is one member.  The page's <h2> names the member,
    and the elements following it are written to `fout` as <category>,
    <record> and <item> elements inside a <regmem>, all within a
    <publicwhip> root.

    fout     -- writable file-like object receiving the XML
    text     -- the register markup
    sdate    -- date string used for name matching and the date attribute
    sdatever -- not used in this function

    Raises ContextException when a heading cannot be parsed or a member
    cannot be matched.
    """
    print("2010-? new register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    memberset = set()
    # drop highlighting spans before parsing
    soup = BeautifulStoneSoup(
        re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text))

    for page in soup('page'):
        title = page.h2.renderContents()
        # these two headings are one closing bracket short
        if title in ('HAGUE, Rt Hon William (Richmond (Yorks)',
                     'PEARCE, Teresa (Erith and Thamesmead'):
            title += ')'
        heading = re.search("^([^,]*), ([^(]*) \((.*)\)\s*$", title)
        if not heading:
            raise ContextException("Failed to break up into first/last/cons: %s" % title)
        (lastname, firstname, constituency) = heading.groups()
        firstname = memberList.striptitles(firstname)[0].decode('utf-8')
        lastname = memberList.lowercaselastname(lastname).decode('utf-8')
        constituency = constituency.decode('utf-8')
        lastname = lastname.replace(u'O\u2019brien', "O'Brien")  # Hmm

        (memberid, remadename, remadecons) = memberList.matchfullnamecons(
            firstname + " " + lastname, constituency, sdate)
        if not memberid:
            raise ContextException("Failed to match name %s %s (%s) date %s\n" % (
                firstname, lastname, constituency, sdate))
        opening = '<regmem personid="%s" membername="%s" date="%s">\n' % (
            memberid, remadename, sdate)
        fout.write(opening.encode("latin-1"))
        memberset.add(memberid)

        category = None
        categoryname = None
        subcategory = None
        inrecord = False  # whether a <record> element is currently open
        for row in page.h2.findNextSiblings():
            rowtext = row.renderContents().decode('utf-8').encode(
                'iso-8859-1', 'xmlcharrefreplace')

            # a spacer ends the current record
            if row.get('class') == 'spacer':
                if inrecord:
                    fout.write('\t\t</record>\n')
                    inrecord = False
                continue
            # skip empty rows, lone full stops and a repeat of the heading
            if not rowtext or re.match('\s*\.\s*$', rowtext):
                continue
            if rowtext == '<strong>%s</strong>' % title:
                continue
            if re.match('\s*Nil\.?\s*$', rowtext):
                fout.write('Nil.\n')
                continue

            # Since 2015 election, register is all paragraphs, no headings :(
            if row.name == 'h3' or row.get('class') == 'shd0' or re.match(
                    '<strong>\d+\. ', rowtext):
                if re.match('\s*$', rowtext):
                    continue
                numbered = re.match(
                    "(?:\s*<strong>)?\s*(\d\d?)\.\s*(.*)(?:</strong>\s*)?$", rowtext)
                if numbered:
                    # a new category closes any open record and category
                    if inrecord:
                        fout.write('\t\t</record>\n')
                        inrecord = False
                    if category:
                        fout.write('\t</category>\n')
                    category, categoryname = numbered.groups()
                    subcategory = None
                    categoryname = re.sub('<[^>]*>(?s)', '', categoryname).strip()
                    fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                    continue

            # anything else is an item, inside a record
            if not inrecord:
                fout.write('\t\t<record>\n')
                inrecord = True

            lettered = re.match("\s*\(([ab])\)\s*(.*)$", rowtext)
            if lettered:
                # an (a)/(b) marker starts a subcategory that stays in force
                subcategory = lettered.group(1)
                fout.write('\t\t\t(%s)\n' % subcategory)
                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, lettered.group(2)))
            elif subcategory:
                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, rowtext))
            else:
                fout.write('\t\t\t<item>%s</item>\n' % rowtext)

        if inrecord:
            fout.write('\t\t</record>\n')
            inrecord = False
        if category:
            fout.write('\t</category>\n')
        fout.write('</regmem>\n')

    # check for missing/extra entries
    membersetexpect = set(m['person_id'] for m in memberList.mpslistondate(sdate))
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
def RunRegmemFilters2010(fout, text, sdate, sdatever):
    """Convert the page-based register of members' interests into XML.

    Each <page> of `text` is one member.  The page's <h2> names the member,
    and the elements following it are written to `fout` as <category> and
    <item> elements inside a <regmem>, all within a <publicwhip> root.

    fout     -- writable file-like object receiving the XML
    text     -- the register markup
    sdate    -- date string used for name matching and the date attribute
    sdatever -- not used in this function

    Raises ContextException when a heading cannot be parsed or a member
    cannot be matched.
    """
    print("2010-? new register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    memberset = set()
    # drop highlighting spans before parsing
    soup = BeautifulStoneSoup(
        re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text))

    for page in soup('page'):
        title = page.h2.renderContents()
        heading = re.search("^([^,]*), ([^(]*) \((.*)\)\s*$", title)
        if not heading:
            raise ContextException("Failed to break up into first/last/cons: %s" % title)
        (lastname, firstname, constituency) = heading.groups()
        firstname = memberList.striptitles(firstname)[0].decode('utf-8')
        lastname = memberList.lowercaselastname(lastname).decode('utf-8')
        constituency = constituency.decode('utf-8')
        lastname = lastname.replace(u'O\u2019brien', "O'Brien")  # Hmm

        (memberid, remadename, remadecons) = memberList.matchfullnamecons(
            firstname + " " + lastname, constituency, sdate)
        if not memberid:
            raise ContextException("Failed to match name %s %s (%s) date %s\n" % (
                firstname, lastname, constituency, sdate))
        opening = '<regmem personid="%s" memberid="%s" membername="%s" date="%s">\n' % (
            memberList.membertoperson(memberid), memberid, remadename, sdate)
        fout.write(opening.encode("latin-1"))
        memberset.add(memberid)

        category = None
        categoryname = None
        subcategory = None
        for row in page.h2.findNextSiblings():
            rowtext = row.renderContents().decode('utf-8').encode(
                'iso-8859-1', 'xmlcharrefreplace')

            # skip empty rows and lone full stops
            if not rowtext or re.match('\s*\.\s*$', rowtext):
                continue
            if re.match('\s*Nil\.?\s*$', rowtext):
                fout.write('Nil.\n')
                continue

            if row.name == 'h3':
                if re.match('\s*$', rowtext):
                    continue
                numbered = re.match("\s*(\d\d?)\.\s*(.*)$", rowtext)
                if numbered:
                    # a numbered heading opens a new category
                    if category:
                        fout.write('\t</category>\n')
                    category, categoryname = numbered.groups()
                    subcategory = None
                    fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                    continue

            if row.get('class') == 'spacer':
                continue

            lettered = re.match("\s*\(([ab])\)\s*(.*)$", rowtext)
            if lettered:
                # an (a)/(b) marker starts a subcategory that stays in force
                subcategory = lettered.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, lettered.group(2)))
            elif subcategory:
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, rowtext))
            else:
                fout.write('\t\t<item>%s</item>\n' % rowtext)

        if category:
            fout.write('\t</category>\n')
        fout.write('</regmem>\n')

    # check for missing/extra entries
    membersetexpect = set(memberList.mpslistondate(sdate))
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
def __init__(self, entry):
    """Populate this object from one parsed calendar item.

    entry -- object exposing ``guid``, ``link`` and an ``event`` child
             with ``attrib['id']`` plus ``chamber``, ``house``, ``date``,
             ``comittee``, ``inquiry``, ``witnesses`` and ``location``
             sub-elements (each read through ``.text``), and optionally
             ``startTime`` / ``endTime``.

    Always set: id, deleted, link_calendar, link_external, chamber,
    event_date, time_start, time_end, people, witnesses.
    Set only when the corresponding input is present: committee_name,
    debate_type, title, witnesses_str, location.
    NOTE(review): presumably the class supplies defaults for the
    conditional attributes -- confirm against the class body.
    """
    self.id = entry.event.attrib['id']
    self.deleted = 0
    self.link_calendar = entry.guid
    self.link_external = entry.link
    chamber = entry.event.chamber.text.strip()
    self.chamber = '%s: %s' % (entry.event.house.text.strip(), chamber)
    self.event_date = entry.event.date.text
    self.time_start = getattr(entry.event, 'startTime', None)
    self.time_end = getattr(entry.event, 'endTime', None)

    # 'comittee' (sic) is the attribute name read from the event; it is
    # kept exactly as spelled. For committee chambers the text is the
    # committee name; elsewhere it is a debate type, except for PMQs,
    # which are handled as a title further down.
    committee_text = entry.event.comittee.text
    if committee_text:
        committee_text = committee_text.strip()
        if chamber in ('Select Committee', 'General Committee'):
            self.committee_name = committee_text
        elif committee_text != "Prime Minister's Question Time":
            self.debate_type = committee_text

    self.people = []

    title_text = entry.event.inquiry.text
    if title_text:
        # A trailing " - Name / Name" suffix is tried as a list of people.
        m = re.search(' - ([^-]*)$', title_text)
        if m:
            person_texts = [x.strip() for x in m.group(1).split('/')]
            for person_text in person_texts:
                member_id, name, cons = memberList.matchfullnamecons(person_text, None, self.event_date)
                if not member_id:
                    # Best-effort fallback to the Lords list. A failed
                    # lookup just means "no match", but interpreter-level
                    # exceptions (KeyboardInterrupt, SystemExit) must not
                    # be swallowed.
                    try:
                        member_id = lordsList.GetLordIDfname(person_text, None, self.event_date)
                    except Exception:
                        pass
                if member_id:
                    self.people.append(int(memberList.membertoperson(member_id).replace('uk.org.publicwhip/person/', '')))

            # Strip the suffix only when every name in it was identified.
            if len(self.people) == len(person_texts):
                title_text = title_text.replace(' - ' + m.group(1), '')

        self.title = title_text.strip()
    elif committee_text == "Prime Minister's Question Time":
        self.title = committee_text

    self.witnesses = []
    witness_text = entry.event.witnesses.text
    if witness_text == 'This is a private meeting.':
        self.title = witness_text
    elif witness_text:
        def link_mp(match):
            # Swap one "Forename Surname MP" mention for a link to that
            # MP's page, recording the person id; unmatched names are
            # left untouched.
            mp = match.group(1)
            member_id, name, cons = memberList.matchfullnamecons(mp, None, self.event_date)
            if not member_id:
                return mp
            pid = int(memberList.membertoperson(member_id).replace('uk.org.publicwhip/person/', ''))
            self.witnesses.append(pid)
            return '<a href="/mp/?p=%d">%s</a>' % (pid, mp)

        # Substitute in a single pass so that a name mentioned more than
        # once is linked once per mention. Calling str.replace once per
        # regex hit re-wrapped the name inside the anchor inserted by an
        # earlier hit, producing nested <a> tags.
        self.witnesses_str = re.sub(r'\b(\w+ \w+ MP)', link_mp, witness_text.strip())

    location_text = entry.event.location.text
    if location_text:
        self.location = location_text.strip()
if made[0:4] != "2004": continue made_date = mx.DateTime.DateTimeFrom(made).date constituency = constituency.replace('\\', '') mp_name = mp_name.replace('\\', '') if constituency == "South Tomshire": # better keep rosa's membership of parliament secret continue if constituency == "Trumpton": # i didn't know james was religious continue if constituency == "Stefstown": # i didn't know stef was knighted continue try: mp_id, name, cons = memberList.matchfullnamecons(mp_name, constituency, made_date) except Exception, e: print >>sys.stderr, "FaxYourMP name match failed", e else: if not mp_id: print >>sys.stderr, "FaxYourMP name match failed %s, %s" % (mp_name, constituency) else: id = memberList.membertoperson(mp_id) if vote.lower() == "no": nohash[id] = nohash.get(id, 0) + 1 elif vote.lower() == "yes" or vote.lower() == "yes"+chr(160): yeshash[id] = yeshash.get(id, 0) + 1 elif vote == "": # print >>sys.stderr, "Blank vote" # Ignore for now pass