def __init__(self, entry):
    """Populate this object from a single calendar feed entry.

    ``entry`` must expose ``guid``, ``link`` and an ``event`` child.  The
    event carries an ``id`` attribute and ``chamber``, ``house``, ``date``,
    ``comittee`` (sic -- that is the name read from the feed), ``inquiry``,
    ``witnesses`` and ``location`` children, each with a ``.text``
    attribute.  NOTE(review): presumably an lxml.objectify element --
    confirm against the caller.

    Always assigned: ``id``, ``deleted``, ``link_calendar``,
    ``link_external``, ``chamber``, ``event_date``, ``time_start``,
    ``time_end``, ``people`` and ``witnesses``.

    Assigned only when the feed supplies the relevant text:
    ``committee_name``, ``debate_type``, ``title``, ``witnesses_str`` and
    ``location``.  Otherwise they keep whatever value they had before this
    call (presumably class-level defaults -- not visible from here).
    """
    person_prefix = 'uk.org.publicwhip/person/'
    pmqs = "Prime Minister's Question Time"

    event = entry.event
    self.id = event.attrib['id']
    self.deleted = 0
    self.link_calendar = entry.guid
    self.link_external = entry.link
    chamber = event.chamber.text.strip()
    self.chamber = '%s: %s' % (event.house.text.strip(), chamber)
    self.event_date = event.date.text
    self.time_start = getattr(event, 'startTime', None)
    self.time_end = getattr(event, 'endTime', None)

    # The "comittee" field doubles as the debate type outside committees.
    committee_text = event.comittee.text
    if committee_text:
        committee_text = committee_text.strip()
        if chamber in ('Select Committee', 'General Committee'):
            self.committee_name = committee_text
        elif committee_text != pmqs:
            self.debate_type = committee_text

    self.people = []
    title_text = event.inquiry.text
    if title_text:
        # A trailing " - Name / Name" suffix lists the people involved.
        m = re.search(' - ([^-]*)$', title_text)
        if m:
            person_texts = [x.strip() for x in m.group(1).split('/')]
            for person_text in person_texts:
                member_id, _name, _cons = memberList.matchfullnamecons(
                    person_text, None, self.event_date)
                if not member_id:
                    # Best effort: fall back to the Lords list.  A failed
                    # lookup just means this person stays unmatched.
                    try:
                        member_id = lordsList.GetLordIDfname(
                            person_text, None, self.event_date)
                    except Exception:
                        pass
                if member_id:
                    person = memberList.membertoperson(member_id)
                    self.people.append(int(person.replace(person_prefix, '')))
            # Only strip the suffix from the title when every name matched.
            if len(self.people) == len(person_texts):
                title_text = title_text.replace(' - ' + m.group(1), '')
        self.title = title_text.strip()
    elif committee_text == pmqs:
        self.title = committee_text

    self.witnesses = []
    witness_text = event.witnesses.text
    if witness_text == 'This is a private meeting.':
        self.title = witness_text
    elif witness_text:
        self.witnesses_str = witness_text.strip()
        # str.replace() rewrites every occurrence at once, so each distinct
        # "Forename Surname MP" string must be handled only once; otherwise
        # a repeated name is substituted again inside its own link.
        seen = set()
        for mp in re.findall(r'\b(\w+ \w+ MP)', self.witnesses_str):
            if mp in seen:
                continue
            seen.add(mp)
            member_id, _name, _cons = memberList.matchfullnamecons(
                mp, None, self.event_date)
            if not member_id:
                continue
            pid = int(memberList.membertoperson(member_id).replace(person_prefix, ''))
            mp_link = '<a href="/mp/?p=%d">%s</a>' % (pid, mp)
            self.witnesses.append(pid)
            self.witnesses_str = self.witnesses_str.replace(mp, mp_link)

    location_text = event.location.text
    if location_text:
        self.location = location_text.strip()
# matcher = '<TD ALIGN="LEFT" VALIGN="TOP"><A HREF="(/weblink/html/member.html/.*)/log=\d+/pos=\d+" TARGET="_parent"><font face="arial,helvetica" size=2>(.*)/(.*)</A></TD>\s*<TD ALIGN="LEFT" VALIGN="TOP"><font face="arial,helvetica" size=2>(.*)</TD>' # matches = re.findall(matcher, content) for line in content: cols = line.split("\t") first = cols[0] last = cols[1] cons = cols[2] money = cols[3:] money = map(lambda x: re.sub("\xa3","", x), money) money = map(lambda x: re.sub(",","", x), money) id, name, cons = memberList.matchfullnamecons(first + " " + last, cons, yeardate) if not id: raise Exception, "Failed to find MP %s %s" % (first, last) pid = memberList.membertoperson(id) # print >>sys.stderr, last, first, money if pid in expmembers: print >>sys.stderr, "Ignored repeated entry for " , pid else: fout.write('<personinfo id="%s" ' % pid) for i in [ 0,1,2,3,4,5,6,7,8,9 ]: if (year=='2004'): if (i==7): col = '7a' elif (i==8 or i==9): col = i else: col = i+1 else: if (i<9):
'03': '2003', '02': '2002', '01': '2001', '00': '2000', '99': '1999', '98': '1998', '97': '1997' } signers = {} edms = {} sigs = {} primary = {} session = sys.argv[1] for memberurl in edmList.edmlookups: pid = memberList.membertoperson(edmList.lookup(memberurl)) m = re.search('=(.*?)SlAsHcOdEsTrInG(.*)', memberurl) lastname = urllib.unquote(m.group(1)) firstname = urllib.unquote(m.group(2)) pnum = int(re.sub('uk.org.publicwhip/person/', '', pid)) # print >> sys.stderr, "Member:%s, ID:%s, session:%s" % (memberurl,pid,sessions[session]) content = get_member(memberurl, pnum, session) if re.search('no EDMs', content): continue for fix in fixes: content = re.sub(fix[0], fix[1], content) m = re.search('ound (\d+) EDMs? signed', content) total = int(m.group(1)) matches = re.findall(matcher, content) count = 0 for (type, ref, url, title, num, day, month, year) in matches:
def RunRegmemFilters2010(fout, text, sdate, sdatever):
    """Convert the 2010-onwards register of members' interests to XML.

    ``text`` is parsed with BeautifulStoneSoup.  For every ``<page>``
    element the ``<h2>`` heading is split into "LAST, First (Constituency)",
    matched against ``memberList`` for ``sdate``, and written to ``fout``
    as one ``<regmem>`` element.  The siblings following the heading become
    ``<category>`` / ``<item>`` lines.

    After all pages are written, the matched member ids are compared with
    ``memberList.mpslistondate(sdate)`` and any missing or extra ids are
    printed to stdout.

    ``sdatever`` is accepted for interface compatibility; it is not used.

    Raises ContextException when a heading cannot be split into
    last/first/constituency or when no member matches it.
    """
    print("2010-? new register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    memberset = set()
    # Drop highlight spans but keep the text they wrap.
    text = re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text)
    soup = BeautifulStoneSoup(text)
    for page in soup('page'):
        title = page.h2.renderContents()
        res = re.search(r"^([^,]*), ([^(]*) \((.*)\)\s*$", title)
        if not res:
            raise ContextException("Failed to break up into first/last/cons: %s" % title)
        (lastname, firstname, constituency) = res.groups()
        firstname = memberList.striptitles(firstname)[0].decode('utf-8')
        lastname = memberList.lowercaselastname(lastname).decode('utf-8')
        constituency = constituency.decode('utf-8')
        lastname = lastname.replace(u'O\u2019brien', "O'Brien")  # Hmm
        (member_id, remadename, remadecons) = memberList.matchfullnamecons(
            firstname + " " + lastname, constituency, sdate)
        if not member_id:
            raise ContextException("Failed to match name %s %s (%s) date %s\n" % (firstname, lastname, constituency, sdate))
        fout.write(('<regmem personid="%s" memberid="%s" membername="%s" date="%s">\n' % (
            memberList.membertoperson(member_id), member_id, remadename, sdate)).encode("latin-1"))
        memberset.add(member_id)

        category = None
        categoryname = None
        subcategory = None
        for row in page.h2.findNextSiblings():
            # Characters outside ISO-8859-1 become XML character references.
            rowtext = row.renderContents().decode('utf-8').encode('iso-8859-1', 'xmlcharrefreplace')
            # Skip empty rows and rows holding only a full stop.
            if not rowtext or re.match(r'\s*\.\s*$', rowtext):
                continue
            if re.match(r'\s*Nil\.?\s*$', rowtext):
                fout.write('Nil.\n')
                continue
            if row.name == 'h3':
                if re.match(r'\s*$', rowtext):
                    continue
                # "<number>. <name>" headings open a new category.  Any
                # other <h3> text falls through and is written as an item.
                m = re.match(r"\s*(\d\d?)\.\s*(.*)$", rowtext)
                if m:
                    if category:
                        fout.write('\t</category>\n')
                    category, categoryname = m.groups()
                    subcategory = None
                    fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                    continue
            if row.get('class') == 'spacer':
                continue
            # A leading "(a)" / "(b)" switches subcategory; it stays in
            # force until the next category heading or the next marker.
            subcategorymatch = re.match(r"\s*\(([ab])\)\s*(.*)$", rowtext)
            if subcategorymatch:
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, subcategorymatch.group(2)))
                continue
            if subcategory:
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, rowtext))
            else:
                fout.write('\t\t<item>%s</item>\n' % rowtext)
        if category:
            fout.write('\t</category>\n')
        fout.write('</regmem>\n')

    membersetexpect = set(memberList.mpslistondate(sdate))

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert an HTML register of members' interests to XML on ``fout``.

    Registers dated 2010-09-01 or later (plain string comparison on
    ``sdate``) are handed to RunRegmemFilters2010.  Otherwise ``text`` is
    patched with ApplyFixSubstitutions, split into ``<TR>`` rows and
    ``<TD>`` cells, and each row is classified by its number of cells and
    which leading cells are empty:

    * 1 cell  -- member heading "LAST, First (Constituency)"
    * 2 cells -- "Nil.", a numbered category heading, or a plain item
    * 3 cells -- an item (optionally with a leading marker cell)
    * 4 cells -- an "(a)"/"(b)" subcategory marker plus item, or an item

    After all rows are written, the matched member ids are compared with
    ``memberList.mpslistondate(sdate)`` and any missing or extra ids are
    printed to stdout.

    ``sdatever`` is only forwarded to RunRegmemFilters2010.

    Raises ContextException when a member heading cannot be parsed or
    matched, when a category number is malformed, or when a row has an
    unrecognised shape.
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    def write_item(subcat, content):
        # Emit one <item>, tagged with the (a)/(b) subcategory if one is set.
        if subcat:
            fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcat, content))
        else:
            fout.write('\t\t<item>%s</item>\n' % content)

    # message for cron so I check I'm using this
    print("New register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('€', '£', text)  # Always get some pound signs wrong
    rows = re.findall("<TR>(.*)</TR>", text)
    # NOTE(review): pattern and replacement are both a plain space here, so
    # this substitution is a no-op as written.  Presumably the pattern was
    # an entity or non-breaking space -- confirm against the upstream file.
    rows = [re.sub(" ", " ", row) for row in rows]
    rows = [re.sub("<B>|</B>|<BR>|`", "", row) for row in rows]
    rows = [re.sub('<span style="background-color: #FFFF00">|</span>', '', row) for row in rows]
    rows = [re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows]
    # NOTE(review): the pattern below appears to be a soft-hyphen character.
    rows = [re.sub("­", "-", row) for row in rows]
    # Strip footnote anchors such as [<A NAME="n1"><A HREF="#note1">1</A>].
    rows = [re.sub(r'\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row) for row in rows]
    rows = [re.sub(r'\[<A NAME="n\d+">\d+\]', '', row) for row in rows]

    # Fix incorrect tabling of categories when highlighting is in play
    rows = [re.sub(r'<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>', r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows]
    # split into cells within a row
    rows = [re.findall(r"<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows]

    memberset = set()
    needmemberend = False  # True while a <regmem> element is open
    category = None
    categoryname = None
    subcategory = None
    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass
        elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # <TR><TD COLSPAN=4> </TD></TR>
            pass
        elif len(row) == 1:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search(r"^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print(row)
                raise ContextException("Failed to break up into first/last/cons: %s" % row[0])
            (lastname, firstname, constituency) = res.groups()
            constituency = constituency.replace(')', '')
            constituency = constituency.replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            stood_down = (
                (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08')
                or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22')
            )
            if stood_down:
                check_date = '2009-06-08'
            else:
                check_date = sdate

            (member_id, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname), constituency, check_date)
            if not member_id:
                raise ContextException("Failed to match name %s %s (%s) date %s" % (firstname, lastname, constituency, sdate))

            # Close whatever the previous member left open, then start anew.
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
                needmemberend = False
            fout.write(('<regmem personid="%s" memberid="%s" membername="%s" date="%s">\n' % (
                memberList.membertoperson(member_id), member_id, remadename, sdate)).encode("latin-1"))
            memberset.add(member_id)
            needmemberend = True
            category = None
            categoryname = None
            subcategory = None
        elif len(row) == 2 and row[0] == '' and re.match(r'Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')
        elif len(row) == 2 and row[0] != '':
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            if category:
                fout.write('\t</category>\n')
            digits = row[0]
            catmatch = re.match(r"\s*(\d\d?)\.$", digits)
            if not catmatch:
                # Report a malformed category number as a parse failure
                # rather than crashing on None.group().
                print(row)
                raise ContextException("Failed to read category number: %s" % digits)
            category = catmatch.group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
        elif len(row) == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
            write_item(subcategory, FixHTMLEntities(row[1]))
        elif len(row) == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
            write_item(subcategory, FixHTMLEntities(row[2]))
        elif len(row) == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            write_item(subcategory, FixHTMLEntities(row[1] + ' ' + row[2]))
        elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match(r"\(([ab])\)$", row[2])
            if not subcategorymatch:
                write_item(subcategory, FixHTMLEntities(row[2] + " " + row[3]))
            else:
                # The marker stays in force for the following items.
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                write_item(subcategory, FixHTMLEntities(row[3]))
        else:
            print(row)
            raise ContextException("Unknown row type match, length %d" % (len(row)))

    # Close anything the final member left open.
    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')
        needmemberend = False

    membersetexpect = set(memberList.mpslistondate(sdate))

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
match = map(lambda x: re.sub("\xa0", "", x), match) match = map(lambda x: x.strip(), match) (url, cons, name) = match # Not in aliases file - see comment there (it's to # avoid ambiguity in debates parsing) if cons == 'Great Yarmouth' and name == 'Tony Wright': name = 'Anthony D Wright' id, canonname, canoncons = memberList.matchfullnamecons(name, cons, date_today) if not id: print >>sys.stderr, "Failed to match %s %s %s" % (name, cons, date_today) continue url = urlparse.urljoin(bbc_index_url, url) pid = memberList.membertoperson(id) if pid in bbcmembers: print >>sys.stderr, "Ignored repeated entry for " , pid else: print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url) bbcmembers.add(pid) sys.stdout.flush() print '</publicwhip>' # Check we have everybody allmembers = sets.Set([ memberList.membertoperson(id) for id in memberList.currentmpslist() ]) symdiff = allmembers.symmetric_difference(bbcmembers) if len(symdiff) > 0:
try: attr = memberList.getmember(member) fullname = attr["firstname"] + " " + attr["lastname"] # Load search page from journa-list params = {} params['name'] = fullname params = urllib.urlencode(params) ur = urllib.urlopen("http://www.journalisted.com/list", params) content = ur.read() ur.close() # Find match count match = re.search("""<p\>(\d+) Matches\<\/p\>""", content) assert match, "%s\ndidn't find matches count %s" % (content, fullname) matches = match.groups()[0] matches = int(matches) if matches > 0: print fullname.encode('utf-8'), matches print memberList.membertoperson(member) links = re.findall("""\<li\>\<a href="([^"]+)">[^<]+\<\/a\>\<\/li\>""", content) assert links, "%s\ndidn't find links despite matches %s" % (content, fullname) print links except: print >>sys.stderr, "trouble with " + member
matcher += '<font face="arial,helvetica" size=2>(?:<[BI]>)?([^<]*?)</font></A>\s*' matcher += '</TD>\s*<!-- \*\*\* Signatures -->.*?' matcher += '(?:<font face="arial,helvetica" size=2>(?:<[BI]>)?(\d+) </font>\s*)?' matcher += '</TD>\s*<!-- \*\*\* Motion date \*\*\* -->.*?' matcher += '<font face="arial,helvetica" size=2>(?:<[BI]>)?(\d\d)\.(\d\d)\.(\d\d)</FONT>' matcher += '(?s)' sessions = {'05':'2005', '':'2004', '04':'2004', '03':'2003', '02':'2002', '01':'2001', '00':'2000', '99':'1999', '98':'1998', '97':'1997'} signers = {} edms = {} sigs = {} primary = {} session = sys.argv[1] for memberurl in edmList.edmlookups: pid = memberList.membertoperson(edmList.lookup(memberurl)) m = re.search('=(.*?)SlAsHcOdEsTrInG(.*)', memberurl) lastname = urllib.unquote(m.group(1)) firstname = urllib.unquote(m.group(2)) pnum = int(re.sub('uk.org.publicwhip/person/','',pid)) # print >> sys.stderr, "Member:%s, ID:%s, session:%s" % (memberurl,pid,sessions[session]) content = get_member(memberurl, pnum, session) if re.search('no EDMs', content): continue; for fix in fixes: content = re.sub(fix[0], fix[1], content) m = re.search('ound (\d+) EDMs? signed', content) total = int(m.group(1)) matches = re.findall(matcher, content) count = 0 for (type, ref, url, title, num, day, month, year) in matches:
if constituency == "Trumpton": # i didn't know james was religious continue if constituency == "Stefstown": # i didn't know stef was knighted continue try: mp_id, name, cons = memberList.matchfullnamecons( mp_name, constituency, made_date) except Exception, e: print >> sys.stderr, "FaxYourMP name match failed", e else: if not mp_id: print >> sys.stderr, "FaxYourMP name match failed %s, %s" % ( mp_name, constituency) else: id = memberList.membertoperson(mp_id) if vote.lower() == "no": nohash[id] = nohash.get(id, 0) + 1 elif vote.lower() == "yes" or vote.lower() == "yes" + chr(160): yeshash[id] = yeshash.get(id, 0) + 1 elif vote == "": # print >>sys.stderr, "Blank vote" # Ignore for now pass else: print >> sys.stderr, standee_name, made, made_date, vote, constituency, mp_name, "--", messagetype, id, name, cons print >> sys.stderr, "Strange vote %s" % vote ih.close()
matches.append(('/wiki/Nicholas_Johnston', 'Nick Johnston')) for (url, name) in matches: id_list = None #cons = cons.decode('utf-8') #cons = cons.replace('&', '&') name = name.decode('utf-8') try: id_list = memberList.match_string_somehow(name, None, '', True) except Exception, e: print >>sys.stderr, e if not id_list: continue for id_to_add in id_list: pid = memberList.membertoperson(id_to_add) wikimembers[pid] = url print '''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>''' k = wikimembers.keys() k.sort() for id in k: url = urlparse.urljoin(wiki_index_urls[0], wikimembers[id]) print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url) print '</publicwhip>' wikimembers = set(wikimembers.keys()) allmembers = set([ memberList.membertoperson(id) for id in memberList.list_all_dates() ]) symdiff = allmembers.symmetric_difference(wikimembers)
(url, cons, name) = match # Not in aliases file - see comment there (it's to # avoid ambiguity in debates parsing) if cons == 'Great Yarmouth' and name == 'Tony Wright': name = 'Anthony D Wright' id, canonname, canoncons = memberList.matchfullnamecons( name, cons, date_today) if not id: print >> sys.stderr, "Failed to match %s %s %s" % (name, cons, date_today) continue url = urlparse.urljoin(bbc_index_url, url) pid = memberList.membertoperson(id) if pid in bbcmembers: print >> sys.stderr, "Ignored repeated entry for ", pid else: print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url) bbcmembers.add(pid) sys.stdout.flush() print '</publicwhip>' # Check we have everybody allmembers = sets.Set( [memberList.membertoperson(id) for id in memberList.currentmpslist()]) symdiff = allmembers.symmetric_difference(bbcmembers)
# For each member, search journalisted.com by full name and print any hits.
# Failures are reported per member on stderr and the loop carries on.
for member in allmembers:
    try:
        attr = memberList.getmember(member)
        fullname = attr["firstname"] + " " + attr["lastname"]

        # Load search page from journa-list (supplying data makes this a POST)
        params = urllib.urlencode({'name': fullname})
        ur = urllib.urlopen("http://www.journalisted.com/list", params)
        try:
            content = ur.read()
        finally:
            # Release the connection even when the read fails.
            ur.close()

        # Find match count
        match = re.search(r"""<p\>(\d+) Matches\<\/p\>""", content)
        if not match:
            # Explicit raise instead of assert, so the check survives -O.
            raise ValueError("%s\ndidn't find matches count %s" % (content, fullname))
        matches = int(match.group(1))

        if matches > 0:
            print("%s %s" % (fullname.encode('utf-8'), matches))
            print(memberList.membertoperson(member))
            links = re.findall(
                r"""\<li\>\<a href="([^"]+)">[^<]+\<\/a\>\<\/li\>""", content)
            if not links:
                raise ValueError("%s\ndidn't find links despite matches %s" % (content, fullname))
            print(links)
    except Exception:
        # Best effort per member.  KeyboardInterrupt and SystemExit are
        # deliberately not caught, so the run can still be aborted.
        sys.stderr.write("trouble with " + member + "\n")
if constituency == "South Tomshire": # better keep rosa's membership of parliament secret continue if constituency == "Trumpton": # i didn't know james was religious continue if constituency == "Stefstown": # i didn't know stef was knighted continue try: mp_id, name, cons = memberList.matchfullnamecons(mp_name, constituency, made_date) except Exception, e: print >>sys.stderr, "FaxYourMP name match failed", e else: if not mp_id: print >>sys.stderr, "FaxYourMP name match failed %s, %s" % (mp_name, constituency) else: id = memberList.membertoperson(mp_id) if vote.lower() == "no": nohash[id] = nohash.get(id, 0) + 1 elif vote.lower() == "yes" or vote.lower() == "yes"+chr(160): yeshash[id] = yeshash.get(id, 0) + 1 elif vote == "": # print >>sys.stderr, "Blank vote" # Ignore for now pass else: print >>sys.stderr, standee_name,made, made_date,vote,constituency,mp_name,"--",messagetype, id, name, cons print >>sys.stderr, "Strange vote %s" % vote ih.close() def responsiveness(id):