Python matchfullnameconsの例、resolvemembernames.memberList.matchfullnamecons Pythonの例

コード例 #1

0

ファイルを表示

ファイル: filterdivision.py プロジェクト: scotm/parlparse

def MpTellerList(fsm, vote, stampurl, sdate):
	"""Turn the teller lines of a division into <mpname teller="yes"> XML lines.

	fsm      -- iterable of text fragments taken from the tellers section
	vote     -- value written into the vote attribute of every line
	stampurl -- handed to ContextException as stamp, for error reporting
	sdate    -- date passed to memberList.matchfullnamecons

	Returns the list of generated XML strings.  Raises ContextException when
	a fragment cannot be parsed, when a name cannot be matched, or when a
	third teller turns up.
	"""
	tellers = []
	for pending in fsm:
		# Markup-only fragments carry no name: the closing </b> of the
		# "Tellers for the (Ayes|Noes):" heading, and an 'and' that has been
		# given a paragraph of its own.
		if pending == '</b>' or pending == '<b> and</b>':
			continue

		# A fragment may hold several tellers joined by " and" (the linefeed
		# is sometimes missing), so keep peeling names off the front.
		while pending:
			parsed = re.match('\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$', pending)
			if not parsed:
				raise ContextException("no match on teller line", stamp=stampurl, fragment=pending)

			# name, optional bracketed constituency, text left over after " and"
			tellername, tellercons, pending = parsed.groups()

			if len(tellers) >= 2:
				print(fsm)
				raise ContextException(' too many tellers ', stamp=stampurl, fragment=pending)

			# It always is
			if tellername == 'Mr. Michael Foster':
				tellercons = 'Worcester'

			matched = memberList.matchfullnamecons(tellername.strip(), tellercons, sdate)
			(mpid, remadename, remadecons) = matched
			if not mpid:
				raise ContextException("teller name bad match", stamp=stampurl, fragment=tellername)

			xmlline = '\t<mpname id="%s" vote="%s" teller="yes">%s</mpname>' % (mpid, vote, FixHTMLEntities(tellername))
			tellers.append(xmlline)

	return tellers

コード例 #2

0

ファイルを表示

ファイル: filtersentence.py プロジェクト: scotm/parlparse

def TokenHonFriend(mhonfriend, phrtok):
	"""Build the 'phrase' token describing a matched honfriend reference.

	mhonfriend -- regex match object; group(2) is looked up as the member's
	              name, with group(1) passed in the argument position that
	              other call sites use for the constituency
	phrtok     -- object whose sdate attribute dates the lookup

	Returns ('phrase', attrs), attrs being the latin-1 encoded attribute text
	carrying class, id and name.  When the lookup fails the id is "unknown"
	and the name is the text exactly as it appeared in the source.

	Raises AssertionError if the chosen name contains an '&'.
	"""
	# will match for ids
	orgname = mhonfriend.group(2)
	res = memberList.matchfullnamecons(orgname, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons = False)
	if not res[0]:  # comes back as None
		nid = "unknown"
		mname = orgname
	else:
		nid = res[0]
		mname = res[1]

	# The name is written straight into an attribute, so it must not carry
	# any xml entities.
	assert not re.search("&", mname), mname

	# Emit mname rather than res[1]: after a failed lookup res[1] is not a
	# usable name, and the fallback chosen above would otherwise be thrown away.
	# if you put the .encode("latin-1") on the res[1] it doesn't work when there are strange characters.
	return ('phrase', (' class="honfriend" id="%s" name="%s"' % (nid, mname)).encode("latin-1"))

コード例 #3

0

ファイルを表示

ファイル: division.py プロジェクト: samknight/parlparse

def MpTellerList(fsm, vote, stampurl, sdate):
    """Turn the teller lines of a division into <mpname teller="yes"> XML lines.

    fsm      -- iterable of text fragments taken from the tellers section
    vote     -- value written into the vote attribute of every line
    stampurl -- handed to ContextException as stamp, for error reporting
    sdate    -- date passed to memberList.matchfullnamecons

    Returns the list of generated XML strings.  Raises ContextException when
    a fragment cannot be parsed, when a name cannot be matched, or when a
    third teller turns up.
    """
    tellers = []
    for pending in fsm:
        # Markup-only fragments carry no name: the closing </b> of the
        # "Tellers for the (Ayes|Noes):" heading, and an 'and' that has been
        # given a paragraph of its own.
        if pending == '</b>' or pending == '<b> and</b>':
            continue

        # A fragment may hold several tellers joined by " and" (the linefeed
        # is sometimes missing), so keep peeling names off the front.
        while pending:
            parsed = re.match(
                '\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$',
                pending)
            if not parsed:
                raise ContextException("no match on teller line",
                                       stamp=stampurl,
                                       fragment=pending)

            # name, optional bracketed constituency, text left over after " and"
            tellername, tellercons, pending = parsed.groups()

            if len(tellers) >= 2:
                print(fsm)
                raise ContextException(' too many tellers ',
                                       stamp=stampurl,
                                       fragment=pending)

            # It always is
            if tellername == 'Mr. Michael Foster':
                tellercons = 'Worcester'

            matched = memberList.matchfullnamecons(tellername.strip(),
                                                   tellercons, sdate)
            (mpid, remadename, remadecons) = matched
            if not mpid:
                raise ContextException("teller name bad match",
                                       stamp=stampurl,
                                       fragment=tellername)

            xmlline = ('\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' %
                       (mpid, vote, FixHTMLEntities(tellername)))
            tellers.append(xmlline)

    return tellers

コード例 #4

0

ファイルを表示

ファイル: filtersentence_xml.py プロジェクト: mashedkeyboard/parlparse

def TokenHonFriend(mhonfriend, phrtok):
    """Build the 'phrase' token describing a matched honfriend reference.

    mhonfriend -- regex match object; group(2) is looked up as the member's
                  name, with group(1) passed in the argument position that
                  other call sites use for the constituency
    phrtok     -- object whose sdate attribute dates the lookup

    Returns ('phrase', attrs), attrs being the latin-1 encoded attribute text
    carrying class, person_id and name.  When the lookup fails the person_id
    is "unknown" and the name is the text exactly as it appeared in the source.

    Raises AssertionError if the chosen name contains an '&'.
    """
    # will match for ids
    orgname = mhonfriend.group(2)
    res = memberList.matchfullnamecons(orgname,
                                       mhonfriend.group(1),
                                       phrtok.sdate,
                                       alwaysmatchcons=False)
    if not res[0]:  # comes back as None
        nid = "unknown"
        mname = orgname
    else:
        nid = res[0]
        mname = res[1]

    # The name is written straight into an attribute, so it must not carry
    # any xml entities.
    assert not re.search("&", mname), mname

    # Emit mname rather than res[1]: after a failed lookup res[1] is not a
    # usable name, and the fallback chosen above would otherwise be thrown away.
    # if you put the .encode("latin-1") on the res[1] it doesn't work when there are strange characters.
    return ('phrase', (' class="honfriend" person_id="%s" name="%s"' %
                       (nid, mname)).encode("latin-1"))

コード例 #5

0

ファイルを表示

#! /usr/bin/python
# -*- coding: utf-8 -*-

import sys
from resolvemembernames import memberList
from lords.resolvenames import lordsList

# Ad-hoc checks of the name resolvers: each call's return value is printed
# for inspection by eye; nothing is asserted.

# Non-ASCII name passed as a unicode string.
print memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22")
# NOTE(review): the script stops here, so none of the checks below run unless
# this line is removed -- presumably left in while debugging the case above.
sys.exit(0)

# Lords lookups.
print lordsList.GetLordIDfname('Baroness Thatcher', None, '2006-05-01')
print lordsList.GetLordIDfname('The Archbishop of York', None, '2006-05-01')
print lordsList.GetLordIDfname('The Bishop of Southwell and Nottingham', None, '2006-05-01')

# Two names with the same constituency, each looked up on two dates.
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2004-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2004-01-22")

# The same constituency name canonicalised on two dates.
print memberList.canonicalcons("Aberdeen North", "2001-01-01")
print memberList.canonicalcons("Aberdeen North", "2005-05-06")

# Lookups by office rather than by personal name.
print memberList.matchdebatename("Solicitor-General", None, "2003-11-21")
print memberList.matchdebatename("The Advocate-General for Scotland", None, "2004-07-30")

print memberList.getmembersoneelection("uk.org.publicwhip/member/1238")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1353")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1357")

# Debate-style names with no constituency given.
print memberList.matchdebatename("Mr. Mackay", None, "2003-11-21")
print memberList.matchdebatename("James Marshall", None, "2003-11-21")

コード例 #6

0

ファイルを表示

ファイル: writetothemfympconv.py プロジェクト: damncabbage/publicwhip

print "name, constituency, email, fax, phone, constituencyfax"
for row in csvreader:
    if row == ["</b>"]:
        break

    origname, region, email, fax, phone, constituencyfax, image_file = map(string.strip, row)

    # ambiguous names
    cons = None
    if origname == "Mr Gareth Thomas":
        cons = "Clwyd West"
    if origname == "Mr Gareth R. Thomas":
        cons = "Harrow West"
    if origname == "Mr Michael Foster":
        cons = "Hastings and Rye"
    if origname == "Mr Michael J. Foster":
        cons = "Worcester"
    if origname == "Mr Anthony D. Wright":
        cons = "Great Yarmouth"
    if origname == "Dr Tony Wright":
        cons = "Cannock Chase"

    id, name, cons =  memberList.matchfullnamecons(origname, cons, date_today)
    if id == None:
        raise Exception("Failed to match '%s'" % origname)

    row = [name, cons, email, fax, phone, constituencyfax]
    row = [x.encode("latin-1") for x in row];
    csvwriter.writerow(row);

コード例 #7

0

ファイルを表示

ファイル: wikipedia-standingdown.py プロジェクト: samknight/parlparse

# Copyright (C) 2009 Matthew Somerville
# This is free software, and you are welcome to redistribute it under
# certain conditions.  However, it comes with ABSOLUTELY NO WARRANTY.
# For details see the file LICENSE.html in the top level of the source.

import datetime
import sys
import urlparse
import re

sys.path.append("../pyscraper")
from resolvemembernames import memberList

today = '2010-04-12'

page = open('../rawdata/MPs_standing_down_in_2010').read()

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
m = re.findall('<li><a href="([^"]*)"[^>]*>([^<]*)</a>', page)
for row in m:
    url, name = row
    name = name.decode('utf-8')
    if name in ('Iris Robinson', 'Ashok Kumar', 'David Taylor'): continue
    id, canonname, canoncons = memberList.matchfullnamecons(name, None, today) 
    pid = memberList.membertoperson(id)
    print ('  <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1')
print '</publicwhip>'

コード例 #8

0

ファイルを表示

ファイル: guardianconv.py プロジェクト: JonathanBowker/parlparse

# Convert a tab-separated list of MPs (name, constituency, person URL,
# constituency URL) into <personinfo>/<consinfo> XML on stdout.
print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>
'''

# NOTE(review): "input" is expected to be a path bound earlier in the script
# (it shadows the builtin); no close of this handle is visible in this excerpt.
ih = open(input, 'r')

c = 0  # running count of input lines read
for l in ih:
    c = c + 1
    # Exactly four tab-separated fields are expected; anything else makes the
    # unpacking raise ValueError.
    origname, origcons, personurl, consurl = map(string.strip, l.split("\t"))
    # "Last, First" -> "First Last"
    origname = re.sub("^(.*), (.*)$", '\\2 \\1', origname)

    # Match the name, and output basic URLs
    print >>sys.stderr, "Working on %s %s" % (origname, origcons)
    id, name, cons =  memberList.matchfullnamecons(origname, origcons, date)
    #print  >>sys.stderr, "ID %s name %s cons %s" % (id, name, cons)
    # NOTE(review): id is not checked before use; other callers in this
    # codebase treat a falsy id as "no match" -- confirm what happens here.
    personid = memberList.membertoperson(id)
    # Escape ampersands because cons is written into an XML attribute below.
    cons = cons.replace("&", "&amp;")

    print '<personinfo id="%s" guardian_mp_summary="%s" />' % (personid, personurl)
    # The numeric path component of the person URL is used as the Aristotle id.
    # NOTE(review): a URL of any other shape leaves url_match as None and the
    # .group(1) call below raises AttributeError.
    url_match = re.search('^http://www.guardian.co.uk/politics/person/(\d+)/(.*)$', personurl)
    guardian_aristotle_id = url_match.group(1)
    print '<personinfo id="%s" guardian_aristotle_id="%s" />' % (personid, guardian_aristotle_id)
    print '<consinfo canonical="%s" guardian_election_results="%s" />' % (cons.encode("latin-1"), consurl)

    # Majority
    setsameelection =  memberList.getmembersoneelection(id)
    #print setsameelection

    # Grab swing from the constituency page

コード例 #9

0

ファイルを表示

ファイル: filter.py プロジェクト: JonathanBowker/parlparse

def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert a register of members' interests HTML page into XML on fout.

    fout     -- writable file-like object receiving the XML
    text     -- the raw HTML of the register
    sdate    -- date string (compared lexically, e.g. '2010-09-01') of the register
    sdatever -- not used here except to be passed on to RunRegmemFilters2010

    Registers dated 2010-09-01 or later are handed to RunRegmemFilters2010.
    Otherwise the HTML table is cut into rows and cells, and each row is
    classified by its number of cells and which of them are empty: member
    heading, category heading, "Nil." or item.

    Raises ContextException when a member heading cannot be split or matched,
    or when a row fits none of the known shapes.  Afterwards prints any
    difference between the members written and memberList.mpslistondate(sdate).
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    # message for cron so I check I'm using this
    print "New register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('&#128;', '&#163;',
                  text)  # Always get some pound signs wrong
    # "." does not match a newline here, so only rows that sit on a single
    # line are picked up.
    rows = re.findall("<TR>(.*)</TR>", text)
    # Strip presentation markup that plays no part in the row classification.
    rows = [re.sub("&nbsp;", " ", row) for row in rows]
    rows = [re.sub("<B>|</B>|<BR>|`", "", row) for row in rows]
    rows = [
        re.sub('<span style="background-color: #FFFF00">|</span>', '', row)
        for row in rows
    ]
    rows = [re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows]
    rows = [re.sub("&#173;", "-", row) for row in rows]
    # Remove footnote markers such as [<A NAME="n1"><A HREF="#note1">1</A>].
    rows = [
        re.sub('\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row)
        for row in rows
    ]
    rows = [re.sub('\[<A NAME="n\d+">\d+\]', '', row) for row in rows]

    # Fix incorrect tabling of categories when highlighting is in play
    rows = [
        re.sub('<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>',
               r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows
    ]
    # split into cells within a row
    rows = [re.findall("<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows]

    # State carried from row to row while the table is walked.
    memberset = set()  # ids of every member written out
    needmemberend = False  # True while a <regmem> element is open
    category = None  # number of the open <category>, or None
    categoryname = None
    subcategory = None  # current "a"/"b" subcategory letter, or None
    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        #print row
        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass
        elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # <TR><TD COLSPAN=4>&nbsp;</TD></TR>
            pass
        elif len(row) == 1:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search("^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print row
                raise ContextException, "Failed to break up into first/last/cons: %s" % row[
                    0]
            (lastname, firstname, constituency) = res.groups()
            # Drop any brackets left inside the constituency name.
            constituency = constituency.replace(')', '')
            constituency = constituency.replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                check_date = '2009-06-08'
            else:
                check_date = sdate
            (id, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname),
                constituency, check_date)
            if not id:
                raise ContextException, "Failed to match name %s %s (%s) date %s" % (
                    firstname, lastname, constituency, sdate)
            # Close whatever the previous member left open before starting
            # this member's element.
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
                needmemberend = False
            fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' %
                        (id, remadename, sdate)).encode("latin-1"))
            memberset.add(id)
            needmemberend = True
            category = None
            categoryname = None
            subcategory = None
        elif len(row) == 2 and row[0] == '' and re.match('Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')
        elif len(row) == 2 and row[0] != '':
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            if category:
                fout.write('\t</category>\n')
            digits = row[0]
            # NOTE(review): a first cell that is not "<number>." makes
            # re.match return None, so .group(1) raises AttributeError.
            category = re.match("\s*(\d\d?)\.$", digits).group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' %
                       (category, categoryname))
        elif len(row) == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
            if subcategory:
                fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, FixHTMLEntities(row[1])))
            else:
                fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[1]))
        elif len(row) == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
            if subcategory:
                fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, FixHTMLEntities(row[2])))
            else:
                fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[2]))
        elif len(row) == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            if subcategory:
                fout.write(
                    '\t\t<item subcategory="%s">%s</item>\n' %
                    (subcategory, FixHTMLEntities(row[1] + ' ' + row[2])))
            else:
                fout.write('\t\t<item>%s</item>\n' %
                           FixHTMLEntities(row[1] + ' ' + row[2]))
        elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1]
                                                 == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match("\(([ab])\)$", row[2])
            if not subcategorymatch:
                # Third cell is ordinary text: join it with the fourth cell
                # and write it under the current subcategory, if any.
                content = FixHTMLEntities(row[2] + " " + row[3])
                if subcategory:
                    fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                               (subcategory, content))
                else:
                    fout.write('\t\t<item>%s</item>\n' % content)
            else:
                # Third cell is "(a)" or "(b)": switch subcategory, then write
                # the fourth cell as its first item.
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, FixHTMLEntities(row[3])))
        else:
            print row
            raise ContextException, "Unknown row type match, length %d" % (
                len(row))
    # Close whatever the last member left open.
    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')
        needmemberend = False

    membersetexpect = set(
        [m['person_id'] for m in memberList.mpslistondate(sdate)])

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if len(missing) > 0:
        print "Missing %d MP entries:\n" % len(missing), missing
    extra = memberset.difference(membersetexpect)
    if len(extra) > 0:
        print "Extra %d MP entries:\n" % len(extra), extra

    fout.write("</publicwhip>\n")

コード例 #10

0

ファイルを表示

ファイル: filter.py プロジェクト: mysociety/parlparse

def RunRegmemFilters2010(fout, text, sdate, sdatever):
        print "2010-? new register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate

        WriteXMLHeader(fout)
	fout.write("<publicwhip>\n")
        
        memberset = set()
        text = re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text)
        t = BeautifulStoneSoup(text)
        for page in t('page'):
                title = page.h2.renderContents()
                if title in ('HAGUE, Rt Hon William (Richmond (Yorks)', 'PEARCE, Teresa (Erith and Thamesmead'):
                        title += ')'
                res = re.search("^([^,]*), ([^(]*) \((.*)\)\s*$", title)
                if not res:
                        raise ContextException, "Failed to break up into first/last/cons: %s" % title
                (lastname, firstname, constituency) = res.groups()
                firstname = memberList.striptitles(firstname.decode('utf-8'))[0]
                lastname = lastname.decode('utf-8')
                if sdate < '2015-06-01':
                    lastname = memberList.lowercaselastname(lastname)
                constituency = constituency.decode('utf-8')
                lastname = lastname.replace(u'O\u2019brien', "O'Brien") # Hmm
                (id, remadename, remadecons) = memberList.matchfullnamecons(firstname + " " + lastname, constituency, sdate)
                if not id:
                        raise ContextException, "Failed to match name %s %s (%s) date %s\n" % (firstname, lastname, constituency, sdate)
                fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' % (id, remadename, sdate)).encode("latin-1"))
                memberset.add(id)
                category = None
                categoryname = None
                subcategory = None
                record = False
                for row in page.h2.findNextSiblings():
                        text = row.renderContents().decode('utf-8').encode('iso-8859-1', 'xmlcharrefreplace')
                        if row.get('class') == 'spacer':
                            if record:
                                fout.write('\t\t</record>\n')
                                record = False
                            continue
                        if not text or re.match('\s*\.\s*$', text): continue
                        if text == '<strong>%s</strong>' % title: continue
                        if re.match('\s*Nil\.?\s*$', text):
                                fout.write('Nil.\n')
                                continue
                        # Since 2015 election, register is all paragraphs, no headings :(
                        if row.name == 'h3' or row.get('class') == 'shd0' or re.match('<strong>\d+\. ', text):
                                if re.match('\s*$', text): continue
                                m = re.match("(?:\s*<strong>)?\s*(\d\d?)\.\s*(.*)(?:</strong>\s*)?$", text)
                                if m:
                                        if record:
                                            fout.write('\t\t</record>\n')
                                            record = False
                                        if category:
                                                fout.write('\t</category>\n')
                                        category, categoryname = m.groups()
                                        subcategory = None
                                        categoryname = re.sub('<[^>]*>(?s)', '', categoryname).strip()
                                        fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                                        continue
                        if not record:
                            fout.write('\t\t<record>\n')
                            record = True
                        subcategorymatch = re.match("\s*\(([ab])\)\s*(.*)$", text)
                        if subcategorymatch:
                                subcategory = subcategorymatch.group(1)
                                fout.write('\t\t\t(%s)\n' % subcategory)
                                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, subcategorymatch.group(2)))
                                continue
                        if subcategory:
                                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' % (subcategory, text))
                        else:
                                fout.write('\t\t\t<item>%s</item>\n' % text)
                if record:
                    fout.write('\t\t</record>\n')
                    record = False
                if category:
                        fout.write('\t</category>\n')
                fout.write('</regmem>\n')                                

        membersetexpect = set([m['person_id'] for m in memberList.mpslistondate(sdate)])
        
        # check for missing/extra entries
        missing = membersetexpect.difference(memberset)
        if len(missing) > 0:
                print "Missing %d MP entries:\n" % len(missing), missing
        extra = memberset.difference(membersetexpect)
        if len(extra) > 0:
                print "Extra %d MP entries:\n" % len(extra), extra

	fout.write("</publicwhip>\n")

コード例 #11

0

ファイルを表示

    if made[0:4] != "2004":
        continue
    made_date = mx.DateTime.DateTimeFrom(made).date

    constituency = constituency.replace('\\', '')
    mp_name = mp_name.replace('\\', '')

    if constituency == "South Tomshire":  # better keep rosa's membership of parliament secret
        continue
    if constituency == "Trumpton":  # i didn't know james was religious
        continue
    if constituency == "Stefstown":  # i didn't know stef was knighted
        continue

    try:
        mp_id, name, cons = memberList.matchfullnamecons(
            mp_name, constituency, made_date)
    except Exception, e:
        print >> sys.stderr, "FaxYourMP name match failed", e
    else:
        if not mp_id:
            print >> sys.stderr, "FaxYourMP name match failed %s, %s" % (
                mp_name, constituency)
        else:
            id = memberList.membertoperson(mp_id)
            if vote.lower() == "no":
                nohash[id] = nohash.get(id, 0) + 1
            elif vote.lower() == "yes" or vote.lower() == "yes" + chr(160):
                yeshash[id] = yeshash.get(id, 0) + 1
            elif vote == "":
                # print >>sys.stderr, "Blank vote"
                # Ignore for now

コード例 #12

0

ファイルを表示

ファイル: filterdivision.py プロジェクト: scotm/parlparse

def MpList(fsm, vote, stampurl, sdate):
	"""Turn the name lines of one side of a division into <mpname> XML lines.

	fsm      -- list of text lines from the division list
	vote     -- value written into the vote attribute of every line
	stampurl -- handed to ContextException as stamp, for error reporting
	sdate    -- date passed to memberList.matchfullnamecons

	Returns the list of generated XML strings.  Raises ContextException when
	a name cannot be parsed or matched, or when an ambiguous name appears
	more or fewer times than it has candidate ids.

	Relies on the module-level patterns reconstnm, reflipname and
	renoflipname, which are defined outside this function.
	"""
	# Merge lone listed constituencies onto end of previous line
	newfsm = []
	for fss in fsm:
		if not fss: continue
		if reconstnm.match(fss):
			# print "constnm only %s appending to previous line %s" % (fss, newfsm[-1])
			# NOTE(review): raises IndexError if the very first non-empty
			# line is a lone constituency, since newfsm is still empty.
			newfsm[-1] += " " + fss
		else:
			newfsm.append(fss)

	res = [ ]
	pfss = ''  # only referenced by the commented-out alphabetical check below

	multimatches = { }  # from tuple to number of matches accounted, and name

	for fss in newfsm:
		#print "fss ", fss

		# break up concattenated lines
		# Beresford, Sir PaulBlunt, Crispin

		# Each pass peels one name off the front of fss into fssf and leaves
		# the remainder in fss, until only whitespace is left.
		while re.search('\S', fss):
			# there was an & in [A-Z] on line below, but it broke up this incorrectly:
			# Simon, Si&#244;n <i>(B'ham Erdington)</i>
			# regsep: a name ending in a lowercase letter, </i>, "." or ")"
			# run straight into the capital that starts the next name.
			regsep = re.search('(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$', fss)
			# regsep2: two names separated by a double space.
			regsep2 = re.match('(.*?,.*?)  ([A-Z].*?,.*)$', fss)
			# A first part ending in "  Mc" means regsep split inside a
			# surname, so that split is rejected.
			if regsep and not re.search('  Mc$', regsep.group(1)):
				fssf = regsep.group(1)
				fss = regsep.group(2)
			elif regsep2:
				fssf = regsep2.group(1)
				fss = regsep2.group(2)
			else:
				fssf = fss
				fss = ''

			# check alphabetical - but "rh" and so on confound so don't bother
			#if pfss and (pfss > fssf):
			#	print pfss, fssf
			#	raise Exception, ' out of alphabetical order %s and %s' % (pfss, fssf)
			#pfss = fssf

			# flipround the name
			# Bradley, rh Keith <i>(Withington)</i>
			# Simon, Sio(r)n <i>(Withington)</i>
			#print "fssf ", fssf
			ginp = reflipname.match(fssf)
			if ginp:
				#print "grps ", ginp.groups()
				fnam = '%s %s' % (ginp.group(2), ginp.group(1))
				cons = ginp.group(3)

			# name not being flipped, is firstname lastname
			else:
				ginp = renoflipname.match(fssf)
				if not ginp:
					raise ContextException("No flipped or non-flipped name match (filterdivision)", stamp=stampurl, fragment=fssf)
				fnam = ginp.group(1);
				cons = ginp.group(2);

			#print "fss ", fssf
			(mpid, remadename, remadecons) = memberList.matchfullnamecons(fnam, cons, sdate, alwaysmatchcons = False)
			if not mpid and remadename == "MultipleMatch":
				# Ambiguous name: hand out the candidate ids in order, one per
				# occurrence of the name in this list.
				assert type(remadecons) == tuple  # actually the list of ids
				i = len(multimatches.setdefault(remadecons, []))  # the index we work with
				if i >= len(remadecons):
					print "Name", fnam, "used too many times for list", remadecons, "where other instances are", multimatches[remadecons]
					raise ContextException("Too many instances", stamp=stampurl, fragment=fnam)
				mpid = remadecons[i]
				multimatches[remadecons].append(fnam)

				# appears with multiple matching which is ignorable when both ambiguous people vote on same side of a division
				#print "For name", fnam, "returning id", mpid, ";", i, " out of ", remadecons

			elif not mpid and remadename != "MultipleMatch":
				print "filterdivision.py: no match for", fnam, cons, sdate
				raise ContextException("No match on name", stamp=stampurl, fragment=fnam)
			#print fnam, " --> ", remadename.encode("latin-1")
			res.append('\t<mpname id="%s" vote="%s">%s</mpname>' % (mpid, vote, FixHTMLEntities(fssf)))

	# now we have to check if the multimatched names were all exhausted
	for ids in multimatches:
		if len(multimatches[ids]) != len(ids):
			print "Insufficient vote matches on name", multimatches[ids], "ids taken to", ids
			raise ContextException("Not enough vote match on ambiguous name", stamp=stampurl, fragment=multimatches[ids][0])
	return res

コード例 #13

0

ファイルを表示

# Turn each tab-separated line ("Last, First" followed by money columns) into
# a <personinfo> element written to fout.
for line in content:
    line = line.strip()
    # Skip blank lines and lines starting with '#'.
    if not line or re.match('#', line):
        continue
    cols = line.split("\t")
    name = cols[0]
    # "Last, First" -> "First Last".
    # NOTE(review): a name without ", " leaves m as None and the next line
    # raises AttributeError.
    m = re.match('(.*?), (.*)$', name)
    name = '%s %s' % (m.group(2), m.group(1))
    # Up to fifteen columns after the name, stripped of "\xa3" (presumably
    # the pound sign in latin-1) and of thousands separators.
    money = cols[1:16]
    money = map(lambda x: re.sub("\xa3", "", x), money)
    money = map(lambda x: re.sub(",", "", x), money)
    id = None
    cons = None
    # This name needs a constituency supplied alongside it.
    if name == 'Mr Michael Foster':
        cons = 'Worcester'
    id, name, cons = memberList.matchfullnamecons(name, cons, yeardate)
    #if not id:
    #	id, name, newcons =  memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate)
    if not id:
        raise Exception, "Failed to find MP in line %s" % line
    pid = memberList.membertoperson(id)
    #	print >>sys.stderr, last, first, money
    if id in expmembers:
        print >> sys.stderr, "Ignored repeated entry for ", id
    else:
        fout.write('<personinfo id="%s" ' % pid)
        # Map each of the fifteen column positions to its column label.
        for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]:
            if i == 0 or i == 1 or i == 2 or i == 3:
                col = i + 1
            elif i == 4:
                col = '5a'

コード例 #14

0

ファイルを表示

def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in ``text`` with <speaker> tags and write to ``fout``.

    The text is split on tables and bold runs.  Each bold run that holds a
    name is resolved with memberList.matchfullnamecons and rewritten as
    ``<speaker person_id="..." speakername="...">name</speaker>``; table
    pieces are left as they are.

    fout  -- output object; only its writelines() method is used.
    text  -- the page text to filter.
    sdate -- date string of the sitting, handed to the fix-up, stamp and
             name-matching helpers.

    Raises ContextException when a name matches nobody and is not covered by
    one of the special cases below.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        '(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
        text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        namematches = memberList.fullnametoids(p3, sdate)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if namematches:
            #print "Fixing missing bold, had name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())
            if not missingbold in text:
                print "ERROR: missing bold text found, but then vanished when replacing"
            text = text.replace(missingbold, bold)
        #else:
        #print "Plausible missing bold not fixed, as no name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out
    tableregexp = ltableregexp + '(?i)'

    lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    # The pattern has one capturing group, so the split keeps the matched
    # tables / bold runs as their own list items between the other text.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Index loop on purpose: neighbouring items fs[i - 1] / fs[i + 1] are
    # edited while walking the list.
    for i in range(len(fs)):
        fss = fs[i]
        fss = stampurl.UpdateStampUrl(fss)  # Speakers have new stamps in them

        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search('\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                '^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
                boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename,
             remadecons) = memberList.matchfullnamecons(name,
                                                        cons,
                                                        sdate,
                                                        alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not person_id:
                multiple = (remadename == "MultipleMatch")
                # On a MultipleMatch the third return value is indexed for
                # the candidate ids.  The short-circuit keeps remadecons[0]
                # from being evaluated for any other name.
                if (multiple and boldnamestring == 'Mr. Michael Foster'
                        and remadecons[0] == 'uk.org.publicwhip/person/10209'):
                    person_id = remadecons[0]
                    remadename = ' speakername="Michael Foster"'
                    remadecons = 'Worcester'
                elif multiple:
                    # Every other ambiguous name lands here, including a
                    # "Mr. Michael Foster" whose first candidate is someone
                    # else.  That case used to leave person_id as None and
                    # fail on .encode() further down.
                    person_id = 'unknown'
                    remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print "  No name,const match (%s,%s)" % (name, cons)
                    raise ContextException("No name match",
                                           stamp=stampurl,
                                           fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \
          (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)

コード例 #15

0

ファイルを表示

    # print i + 1
    matcher = '<a\s*href="(/1/shared/mpdb/html/\d+.stm)" title="Profile of the MP for (.*?)(?: \(.*?\))?"><b>\s*([\s\S]*?)\s*</b></a></td>'
    matches = re.findall(matcher, content)
    for match in matches:
        match = map(lambda x: re.sub("&amp;", "&", x), match)
        match = map(lambda x: re.sub("\s+", " ", x), match)
        match = map(lambda x: re.sub("\xa0", "", x), match)
        match = map(lambda x: x.strip(), match)
        (url, cons, name) = match

        # Not in aliases file - see comment there (it's to
        # avoid ambiguity in debates parsing)
        if cons == 'Great Yarmouth' and name == 'Tony Wright':
            name = 'Anthony D Wright'

        id, canonname, canoncons = memberList.matchfullnamecons(
            name, cons, date_today)
        if not id:
            print >> sys.stderr, "Failed to match %s %s %s" % (name, cons,
                                                               date_today)
            continue
        url = urlparse.urljoin(bbc_index_url, url)

        pid = memberList.membertoperson(id)
        if pid in bbcmembers:
            print >> sys.stderr, "Ignored repeated entry for ", pid
        else:
            print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url)

        bbcmembers.add(pid)

    sys.stdout.flush()

コード例 #16

0

ファイルを表示

    # These four files are looked up with an explicit constituency
    # (presumably because two MPs share each name); all other files are
    # matched on name alone, with cons left as None.
    cons = {
        "thomas_gareth_591.jpg": "Clwyd West",
        "thomas_gareth_r_592.jpg": "Harrow West",
        "wright_tony_w_654.jpg": "Cannock Chase",
        "wright_tony_653.jpg": "Great Yarmouth",
    }.get(file)

    last = last.replace("_", " ")
    fullname = memberList.fixnamecase("%s %s" % (first, last))
    (id, correctname,
     correctcons) = memberList.matchfullnamecons(fullname, cons, photodate)
    # Person id with its "uk.org.publicwhip/person/" prefix removed becomes
    # the new base file name.
    id = memberList.membertoperson(id).replace("uk.org.publicwhip/person/", "")

    renamemap[file] = "%s.jpg" % id

    # print file, renamemap[file]

# Sanity check: exactly 659 files are expected in the rename map.
# NOTE(review): asserts are stripped under "python -O", so this check (and the
# one in the loop below) would silently disappear in that mode.
assert len(
    renamemap.keys()) == 659, "got %d keys, not 659" % len(renamemap.keys())

# sys.exit(1)

# Do renaming
for name, newname in renamemap.iteritems():
    # Refuse to continue if the target name already exists on disk.
    assert not os.path.exists(newname), "file %s already exists" % newname

コード例 #17

0

ファイルを表示

ファイル: expenses2008.py プロジェクト: JonathanBowker/parlparse

	    continue
	lastname = first_col
	firstname_and_honorific = firstname_from_string(cols[1])
	
	name = '%s %s' % (firstname_and_honorific, lastname)
	name = name.decode("latin-1", "replace")
	money = cols[2:28]
	# Normalise each amount: drop "\xa3" (presumably a Latin-1 pound sign),
	# drop commas, then drop a trailing ".00".
	money = map(lambda x: re.sub("\xa3","", x), money)
	money = map(lambda x: re.sub(",","", x), money)
	# The dot must be escaped: an unescaped ".00$" matches any character
	# followed by "00", which would turn e.g. "1500" into "1".
	money = map(lambda x: re.sub(r"\.00$","", x), money)
	id = None
	# other Michael Foster is Michael Jabez Foster
	cons_hint = None
	if name == 'Mr Michael Foster':
		cons_hint = 'Worcester'
	# The lookup result is still stored in cons, but the hint is kept in its
	# own variable so a failed first lookup cannot overwrite it.
	id, found_name, cons = memberList.matchfullnamecons(name, cons_hint, yeardate)
	if not id:
		# Retry against the other date with the original constituency hint.
		id, found_name, newcons = memberList.matchfullnamecons(name, cons_hint, otheryeardate)
	if not id:
		raise Exception("Failed to find MP in line %s %d" % (line, line_index))
	pid = memberList.membertoperson(id)
	# print >>sys.stderr, lastname, firstname_and_honorific, money
	if id in expmembers:
		print >>sys.stderr, "Ignored repeated entry for " , id
	else:
		fout.write('<personinfo id="%s" ' % pid)
		expense_cols = ['total_inc_travel',
					    'total_exc_travel', 
					    'total_travel',
					    '1',
						'2',

コード例 #18

0

ファイルを表示

ファイル: expenses2009.py プロジェクト: samknight/parlparse

# Write the XML prologue and the opening <publicwhip> element.
fout.write('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>\n''')

# NOTE(review): the file object handed to csv.reader is never closed in the
# part of the script shown here.
content = csv.reader(open('../rawdata/mpsexpenses200809.txt'))
for cols in content:
    if cols[0] == 'ID': continue  # Header
    #if cols[1] == 'TOTALS': continue # Footer
    name = cols[0].decode('utf-8')
    #party = cols[2]
    #cons = cols[3].decode('utf-8')
    # Every column after the name is treated as an amount.
    money = cols[1:]
    # Remove "\xa3" (presumably a Latin-1 pound sign) and commas.
    money = map(lambda x: re.sub("\xa3", "", x), money)
    money = map(lambda x: re.sub(",", "", x), money)
    id = None
    cons = None
    # Look the name up (no constituency) on 2008-05-01 and, if that finds
    # nobody, on 2008-12-01.
    id, found_name, newcons = memberList.matchfullnamecons(
        name, cons, '2008-05-01')
    if not id:
        id, found_name, newcons = memberList.matchfullnamecons(
            name, cons, '2008-12-01')
    if not id:
        raise Exception, "Failed to find MP %s" % name
    pid = memberList.membertoperson(id)
    fout.write('<personinfo id="%s" ' % pid)
    # Labels for the first eight money columns, in column order.
    expense_cols = [
        '1', '2', '3', '4', 'total_travel', 'stationery', '9',
        'comms_allowance'
    ]
    total = 0
    for i in range(8):
        col = expense_cols[i]
        # float() raises ValueError if a cell is not numeric after clean-up.
        total += float(money[i].strip())

コード例 #19

0

ファイルを表示

    #<td><a href="/wiki/Lyn_Brown" title="Lyn Brown">Lyn Brown</a></td>
    #<td>Labour</td>
    matcher = '<tr>\s+<td><a href="/wiki/[^"]+" [^>]*?title="[^"]+">([^<]+)</a>(?:<br />\s+<small>.*?</small>)?\s*</td>\s+(?:<td[^>]*>\s*</td>\s*<td[^>]*><a[^>]*>[^<]*</a>\s*</td>\s*<td[^>]*>\s*</td>\s*)?<td>(?:(?:<span class="sortkey">[^<]*</span>|<span data-sort-value="[^"]*">)<span class="vcard"><span class="fn">)?(?:Dr |Sir |The Rev\. )?<a href="(/wiki/[^"]+)" [^>]*?title="[^"]+"[^>]*>([^<]+)</a>(?:(?:</span>){2,3})?(?:&#160;\(.*?\))?\s*</td>|by-election,[^"]+">([^<]+)</a> [^ ]{1,3} <a href="(/wiki/[^"]+)" title="[^"]+">([^<]+)</a>'
    matches = re.findall(matcher, content)
    for (cons, url, name, cons2, url2, name2) in matches:
        id = None
        if cons2:
            cons = cons2
            name = name2
            url = url2
        cons = cons.decode('utf-8')
        cons = cons.replace('&amp;', '&')
        name = name.decode('utf-8')
        try:
            (id, canonname,
             canoncons) = memberList.matchfullnamecons(name, cons,
                                                       date_parl[year])
        except Exception, e:
            print >> sys.stderr, e
        if not id:
            continue
        wikimembers[id] = url

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
k = wikimembers.keys()
k.sort()
for id in k:
    url = urlparse.urljoin(wiki_index_url, wikimembers[id])
    print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'

コード例 #20

0

ファイルを表示

ファイル: parlparse-ids.py プロジェクト: mhl/mysociety-cvs

# Converts triple of (name, constituency, date) into parlparse person id.
# Reads lines from standard input, each line having the triple hash-separated.
# Outputs the person ids, one per line.

import sys
import os

# Check this out from the ukparse project using Subversion:
# svn co https://scm.kforge.net/svn/ukparse/trunk/parlparse
os.chdir("../../../../parlparse/pyscraper")
sys.path.append(".")
import re
from resolvemembernames import memberList

while 1:
    sys.stdin.flush()
    line = sys.stdin.readline()
    if not line:
        break

    line = line.decode("utf-8")
    name, cons, date_today = line.split("#")

    id, canonname, canoncons = memberList.matchfullnamecons(name, cons, date_today)
    if not id:
        print >>sys.stderr, "failed to match %s (%s) %s" % (name, cons, date_today)

    person_id = memberList.membertoperson(id)
    print person_id
    sys.stdout.flush()

コード例 #21

0

ファイルを表示

ファイル: filter.py プロジェクト: mysociety/parlparse

def RunRegmemFilters(fout, text, sdate, sdatever):
        """Convert an HTML register of members' interests into XML on fout.

        For sdate >= '2010-09-01' the work is handed to RunRegmemFilters2010.
        Otherwise every <TR> row of text is reduced to its cell contents and
        classified by the number of cells and which of them are empty: member
        heading, category heading, "Nil." marker, or item text.

        fout     -- output object; written with write().
        text     -- the HTML text of the register.
        sdate    -- date string; used for name matching and written into each
                    <regmem> element.
        sdatever -- only passed through to RunRegmemFilters2010.

        Raises ContextException when a member heading cannot be split into
        last name / first name / constituency, when the name cannot be
        matched, or when a row has an unrecognised shape.
        """
        if sdate >= '2010-09-01':
                return RunRegmemFilters2010(fout, text, sdate, sdatever)

        # message for cron so I check I'm using this
        print "New register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate

	text = ApplyFixSubstitutions(text, sdate, fixsubs)

        WriteXMLHeader(fout)
	fout.write("<publicwhip>\n")

        text = re.sub('Rt Shaun', 'Shaun', text) # Always get his name wrong
        text = re.sub('&#128;', '&#163;', text) # Always get some pound signs wrong
        # One string per table row; "." does not cross newlines here, so only
        # rows that sit on a single line are picked up.
        rows = re.findall("<TR>(.*)</TR>", text)
        # Strip presentational markup and footnote anchors from each row.
        rows = [ re.sub("&nbsp;", " ", row) for row in rows ]
        rows = [ re.sub("<B>|</B>|<BR>|`", "", row) for row in rows ]
        rows = [ re.sub('<span style="background-color: #FFFF00">|</span>', '', row) for row in rows ]
        rows = [ re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows ]
        rows = [ re.sub("&#173;", "-", row) for row in rows ]
        rows = [ re.sub('\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row) for row in rows ]
        rows = [ re.sub('\[<A NAME="n\d+">\d+\]', '', row) for row in rows ]

        # Fix incorrect tabling of categories when highlighting is in play
        rows = [ re.sub('<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>', r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows ]
        # split into cells within a row
        rows = [ re.findall("<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows ]

        # State carried from row to row:
        #   memberset      - ids of the members written so far
        #   needmemberend  - True while a <regmem> element is still open
        #   category       - number of the open <category>, or None
        #   subcategory    - current "(a)"/"(b)" letter, or None
        memberset = set()
        needmemberend = False
        category = None
        categoryname = None
        subcategory = None
        for row in rows:
                # Row text with all tags removed, used only for the emptiness test.
                striprow = re.sub('</?[^>]+>', '', "".join(row))
                #print row
                if striprow.strip() == "":
                        # There is no text on the row, just tags
                        pass
                elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
                        # <TR><TD COLSPAN=4>&nbsp;</TD></TR>
                        pass
                elif len(row) == 1:
                        # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
                        res = re.search("^([^,]*), ([^(]*) \((.*)\)$", row[0])
                        if not res:
                                print row
                                raise ContextException, "Failed to break up into first/last/cons: %s" % row[0]
                        (lastname, firstname, constituency) = res.groups()
                        # Remove any brackets left inside the constituency text.
                        constituency = constituency.replace(')', '')
                        constituency = constituency.replace('(', '')
                        firstname = memberList.striptitles(firstname)[0]

                        # Register came out after they stood down
                        if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                            or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                                check_date = '2009-06-08'
                        else:
                                check_date = sdate
                        (id, remadename, remadecons) = memberList.matchfullnamecons(firstname + " " + memberList.lowercaselastname(lastname), constituency, check_date)
                        if not id:
                                raise ContextException, "Failed to match name %s %s (%s) date %s" % (firstname, lastname, constituency, sdate)
                        # Close whatever is still open for the previous member
                        # before starting the new <regmem>.
                        if category:
                                fout.write('\t</category>\n')
                        if needmemberend:
                                fout.write('</regmem>\n')                                
                                needmemberend = False
                        fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' % (id, remadename, sdate)).encode("latin-1"))
                        memberset.add(id)
                        needmemberend = True
                        category = None
                        categoryname = None
                        subcategory = None
                elif len(row) == 2 and row[0] == '' and re.match('Nil\.\.?', row[1]):
                        # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR> 
                        fout.write('Nil.\n')
                elif len(row) == 2 and row[0] != '':
                        # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
                        if category:
                                fout.write('\t</category>\n')
                        digits = row[0]
                        # NOTE(review): if the first cell is not one or two digits
                        # followed by a dot, re.match returns None and .group fails.
                        category = re.match("\s*(\d\d?)\.$", digits).group(1)
                        categoryname = row[1]
                        subcategory = None
                        fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                elif len(row) == 2 and row[0] == '':
                        # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
                        if subcategory:
                                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[1])))
                        else:
                                fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[1]))
                elif len(row) == 3 and row[0] == '' and row[1] == '':
                        # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
                        if subcategory:
                                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[2])))
                        else:
                                fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[2]))
                elif len(row) == 3 and row[0] == '':
                        # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
                        if subcategory:
                                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[1] + ' ' + row[2])))
                        else:
                                fout.write('\t\t<item>%s</item>\n' % FixHTMLEntities(row[1] + ' ' + row[2]))
                elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'):
                        # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
                        subcategorymatch = re.match("\(([ab])\)$", row[2])
                        if not subcategorymatch:
                                # Third cell is not "(a)"/"(b)": treat cells 3 and 4 as one item.
                                content = FixHTMLEntities(row[2] + " " + row[3])
                                if subcategory:
                                        fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, content))
                                else:
                                        fout.write('\t\t<item>%s</item>\n' % content)
                        else:
                                # A new "(a)"/"(b)" letter; it stays current for later items.
                                subcategory = subcategorymatch.group(1)
                                fout.write('\t\t(%s)\n' % subcategory)
                                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, FixHTMLEntities(row[3])))
                else:
                        print row
                        raise ContextException, "Unknown row type match, length %d" % (len(row))
        # Close anything still open after the last row.
        if category:
                fout.write('\t</category>\n')
        if needmemberend:
                fout.write('</regmem>\n')                                
                needmemberend = False

        membersetexpect = set([m['person_id'] for m in memberList.mpslistondate(sdate)])
        
        # check for missing/extra entries
        # (differences are only printed, they do not stop the run)
        missing = membersetexpect.difference(memberset)
        if len(missing) > 0:
                print "Missing %d MP entries:\n" % len(missing), missing
        extra = memberset.difference(membersetexpect)
        if len(extra) > 0:
                print "Extra %d MP entries:\n" % len(extra), extra

	fout.write("</publicwhip>\n")

コード例 #22

0

ファイルを表示

ファイル: test.py プロジェクト: scotm/parlparse

#! /usr/bin/python
# -*- coding: utf-8 -*-

import sys
sys.path.append('lords/')
from resolvemembernames import memberList
from resolvelordsnames import lordsList

# Manual check script: each line prints the result of one lookup so it can be
# inspected by eye.  Nothing is asserted.
print memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22")
# The script stops here; every statement below is currently unreachable until
# this exit is removed or moved.
sys.exit(0)

# Lords name lookups.
print lordsList.GetLordIDfname('Baroness Thatcher', None, '2006-05-01')
print lordsList.GetLordIDfname('The Archbishop of York', None, '2006-05-01')
print lordsList.GetLordIDfname('The Bishop of Southwell and Nottingham', None, '2006-05-01')

# Two names with the same constituency, each tried on a 2006 and a 2004 date.
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2004-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2004-01-22")

# Same constituency name on two different dates.
print memberList.canonicalcons("Aberdeen North", "2001-01-01")
print memberList.canonicalcons("Aberdeen North", "2005-05-06")

# Office titles passed in place of a personal name.
print memberList.matchdebatename("Solicitor-General", None, "2003-11-21")
print memberList.matchdebatename("The Advocate-General for Scotland", None, "2004-07-30")

print memberList.getmembersoneelection("uk.org.publicwhip/member/1238")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1353")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1357")

print memberList.matchdebatename("Mr. Mackay", None, "2003-11-21")

コード例 #23

0

ファイルを表示

ファイル: wikipedia-commons.py プロジェクト: scotm/parlparse

#<td><a href="/wiki/West_Ham_%28UK_Parliament_constituency%29" title="West Ham (UK Parliament constituency)">West Ham</a></td>
#<td><a href="/wiki/Lyn_Brown" title="Lyn Brown">Lyn Brown</a></td>
#<td>Labour</td>
    matcher = '<tr>\s+<td><a href="/wiki/[^"]+" [^>]*?title="[^"]+">([^<]+)</a>(?:<br />\s+<small>.*?</small>)?</td>\s+(?:<td style="[^"]*"></td>\s*<td[^>]*><a[^>]*>[^<]*</a></td>\s*<td style="[^"]*"></td>\s*)?<td>(?:Dr |Sir |The Rev\. )?<a href="(/wiki/[^"]+)" [^>]*?title="[^"]+"[^>]*>([^<]+)</a>(?:&#160;\(.*?\))?</td>|by-election,[^"]+">([^<]+)</a> [^ ]{1,3} <a href="(/wiki/[^"]+)" title="[^"]+">([^<]+)</a>';
    # Each hit is a 6-tuple: (cons, url, name) from the table-row alternative
    # of the pattern and (cons2, url2, name2) from the by-election alternative.
    matches = re.findall(matcher, content)
    for (cons, url, name, cons2, url2, name2) in matches:
        id = None
        # When the by-election groups are filled, they replace the first trio.
        if cons2:
            cons = cons2
            name = name2
            url = url2
        cons = cons.decode('utf-8')
        cons = cons.replace('&amp;', '&')
        name = name.decode('utf-8')
        try:
            (id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year])
        except Exception, e:
            # Best effort: the error is reported, id stays None and the row is
            # skipped just below.
            print >>sys.stderr, e
        if not id:
            continue
        # Entries are keyed by the value membertoperson returns for the id; a
        # later row for the same key overwrites the earlier URL.
        pid = memberList.membertoperson(id)
        wikimembers[pid] = url

# Emit one <personinfo> element per collected entry, in sorted key order.
print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
k = wikimembers.keys()
k.sort()
for id in k:
    # Resolve the stored /wiki/... path against the index page URL.
    url = urlparse.urljoin(wiki_index_url, wikimembers[id])
    print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'

コード例 #24

0

ファイルを表示

ファイル: photorenamer.py プロジェクト: JonathanBowker/parlparse

    (last, first, alienid) = match.groups()

    # Explicit constituency for the four listed files (presumably the ones
    # whose name alone is ambiguous); None for every other file.
    cons = {
        "thomas_gareth_591.jpg": "Clwyd West",
        "thomas_gareth_r_592.jpg": "Harrow West",
        "wright_tony_w_654.jpg": "Cannock Chase",
        "wright_tony_653.jpg": "Great Yarmouth",
    }.get(file)

    last = last.replace("_", " ")
    fullname = memberList.fixnamecase("%s %s" % (first, last))
    (id, correctname, correctcons) = memberList.matchfullnamecons(fullname, cons, photodate)
    # The new file name is the person id without its
    # "uk.org.publicwhip/person/" prefix.
    id = memberList.membertoperson(id).replace("uk.org.publicwhip/person/", "")

    renamemap[file] = "%s.jpg" % id

    # print file, renamemap[file]

# Sanity check: exactly 659 files are expected in the rename map.
# NOTE(review): asserts are stripped under "python -O", so this check (and the
# one in the loop below) would silently disappear in that mode.
assert len(renamemap.keys()) == 659, "got %d keys, not 659" % len(renamemap.keys())

# sys.exit(1)

# Do renaming
for name, newname in renamemap.iteritems():
    # Refuse to continue if the target name already exists on disk.
    assert not os.path.exists(newname), "file %s already exists" % newname
    print name, "=>", newname

コード例 #25

0

ファイルを表示

ファイル: future-fetch.py プロジェクト: stuartlangridge/theyworkforyou

    def __init__(self, entry):
        """Fill in this object's fields from one calendar feed entry.

        entry -- feed item; it is indexed with the namespaced "event" tag and
                 its guid and link attributes are read.
                 NOTE(review): looks like an lxml.objectify-style element,
                 confirm against the caller.

        Sets id, deleted, link_calendar, link_external, chamber, event_date,
        time_start, time_end, people and witnesses on every call.
        committee_name, debate_type, title, witnesses_str and location are
        set only when the matching text is present in the entry.
        """
        event = entry['{http://services.parliament.uk/ns/calendar/feeds}event']
        self.id = event.attrib['id']
        self.deleted = 0
        self.link_calendar = entry.guid
        self.link_external = entry.link
        chamber = event.chamber.text.strip()
        self.chamber = '%s: %s' % (event.house.text.strip(), chamber)
        self.event_date = event.date.text
        self.time_start = getattr(event, 'startTime', None)
        self.time_end = getattr(event, 'endTime', None)

        # The attribute name is spelled "comittee" here; it is left exactly
        # as written (presumably it mirrors the element name in the feed).
        committee_text = event.comittee.text
        if committee_text:
            committee_text = committee_text.strip()
            if chamber in ('Select Committee', 'General Committee'):
                self.committee_name = committee_text
            elif committee_text != "Prime Minister's Question Time":
                self.debate_type = committee_text

        self.people = []

        title_text = event.inquiry.text
        if title_text:
            # A trailing " - A / B" part of the title is tried as a list of
            # people.
            m = re.search(' - ([^-]*)$', title_text)
            if m:
                person_texts = [x.strip() for x in m.group(1).split('/')]

                for person_text in person_texts:
                    id, name, cons = memberList.matchfullnamecons(
                        person_text, None, self.event_date)
                    if not id:
                        # Not matched as an MP: try the Lords list.  A failed
                        # Lords lookup is deliberately ignored, but only
                        # ordinary errors are swallowed, so Ctrl-C and
                        # SystemExit still get through.
                        try:
                            id = lordsList.GetLordIDfname(
                                person_text, None, self.event_date)
                        except Exception:
                            pass
                    if id:
                        self.people.append(
                            int(id.replace('uk.org.publicwhip/person/', '')))

                # Drop the name list from the title only if every name in it
                # was resolved.
                if len(self.people) == len(person_texts):
                    title_text = title_text.replace(' - ' + m.group(1), '')

            self.title = title_text.strip()
        elif committee_text == "Prime Minister's Question Time":
            self.title = committee_text

        self.witnesses = []
        witness_text = event.witnesses.text
        if witness_text == 'This is a private meeting.':
            self.title = witness_text
        elif witness_text:
            self.witnesses_str = witness_text.strip()
            # str.replace() below rewrites every occurrence of a name at
            # once, so each distinct name must be handled only once.  A
            # repeated name would otherwise be wrapped in a second, nested
            # link and its id appended twice.
            seen = set()
            for mp in re.findall(r'\b(\w+ \w+ MP)', self.witnesses_str):
                if mp in seen:
                    continue
                seen.add(mp)
                id, name, cons = memberList.matchfullnamecons(
                    mp, None, self.event_date)
                if not id:
                    continue
                pid = int(id.replace('uk.org.publicwhip/person/', ''))
                mp_link = '<a href="/mp/?p=%d">%s</a>' % (pid, mp)
                self.witnesses.append(pid)
                self.witnesses_str = self.witnesses_str.replace(mp, mp_link)

        location_text = event.location.text
        if location_text:
            self.location = location_text.strip()

コード例 #26

0

ファイルを表示

ファイル: expenses2008.py プロジェクト: samknight/parlparse

        continue
    lastname = first_col
    firstname_and_honorific = firstname_from_string(cols[1])

    name = '%s %s' % (firstname_and_honorific, lastname)
    name = name.decode("latin-1", "replace")
    money = cols[2:28]
    # Normalise each amount: drop "\xa3" (presumably a Latin-1 pound sign),
    # drop commas, then drop a trailing ".00".
    money = map(lambda x: re.sub("\xa3", "", x), money)
    money = map(lambda x: re.sub(",", "", x), money)
    # The dot must be escaped: an unescaped ".00$" matches any character
    # followed by "00", which would turn e.g. "1500" into "1".
    money = map(lambda x: re.sub(r"\.00$", "", x), money)
    id = None
    # other Michael Foster is Michael Jabez Foster
    cons_hint = None
    if name == 'Mr Michael Foster':
        cons_hint = 'Worcester'
    # The lookup result is still stored in cons, but the hint is kept in its
    # own variable so a failed first lookup cannot overwrite it.
    id, found_name, cons = memberList.matchfullnamecons(name, cons_hint, yeardate)
    if not id:
        # Retry against the other date with the original constituency hint.
        id, found_name, newcons = memberList.matchfullnamecons(
            name, cons_hint, otheryeardate)
    if not id:
        raise Exception("Failed to find MP in line %s %d" % (line, line_index))
    pid = memberList.membertoperson(id)
    # print >>sys.stderr, lastname, firstname_and_honorific, money
    if id in expmembers:
        print >> sys.stderr, "Ignored repeated entry for ", id
    else:
        fout.write('<personinfo id="%s" ' % pid)
        expense_cols = [
            'total_inc_travel', 'total_exc_travel', 'total_travel', '1', '2',
            '3', '4', '7', '7a', '8', '9', 'comms_allowance',
            'mp_reg_travel_a', 'mp_reg_travel_b', 'mp_reg_travel_c',

コード例 #27

0

ファイルを表示

ファイル: filterwransspeakers.py プロジェクト: spudmind/parlparse

def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with <speaker> tags.

    The text is first passed through ApplyFixSubstitutions.  Paragraphs whose
    speaker name was left un-bolded are then repaired, and the text is split
    on tables and bold runs.  Each bold run holding a name is matched with
    memberList.matchfullnamecons and rewritten in place as
    '<speaker speakerid="..." speakername="...">name</speaker>'.

    Args:
        fout: output object; only its writelines() method is called.
        text: the text to filter.
        sdate: date handed to ApplyFixSubstitutions, StampUrl and the
            member name matcher.

    Raises:
        ContextException: when a bold name matches no member and is not one
            of the special cases handled below.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names,
    # but occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        # Only fix if we found a matching name in the middle (and do it even
        # if ambiguous)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    # tables have bolds, so must be separated out
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'
    # The case-insensitive flag is placed at the start of each pattern; it
    # applies to the whole expression either way.
    tableregexp = re.compile('(?i)' + ltableregexp)
    splitregexp = re.compile('(?i)(%s|%s)' % (ltableregexp, lspeakerregexp))
    speakerregexp = re.compile(r'(?i)<b>\s*([^:]*)[:\s]*?([^<:]*)</b>')

    # setup for scanning through the file.
    fs = splitregexp.split(text)

    # for error messages
    stampurl = StampUrl(sdate)

    for i in range(len(fs)):
        # Speakers have new stamps in them
        fss = stampurl.UpdateStampUrl(fs[i])

        if tableregexp.match(fss):
            continue

        speakerg = speakerregexp.findall(fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search(r'\S', speakerg[0][1]):
            fs[i+1] = speakerg[0][1] + fs[i+1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i+1] = sqb[0][1] + fs[i+1]

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i-1])
            if oqnsep:
                fs[i-1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        if robj:
            # TODO: do something with group 1 here (it is the "failed
            # oral questions" signifier)
            boldnamestring = robj.group(2)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            speakerid = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (speakerid, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons = False)
            if speakerid and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not speakerid:
                if remadename == "MultipleMatch":
                    # On a multiple match remadecons holds the candidate ids.
                    # An unqualified "Mr. Michael Foster" is resolved to one
                    # of two known member ids when it is among them.
                    fosterid = None
                    if boldnamestring == 'Mr. Michael Foster':
                        if len(remadecons) > 1 and remadecons[1] == 'uk.org.publicwhip/member/1939':
                            fosterid = remadecons[1]
                        elif len(remadecons) > 0 and remadecons[0] == 'uk.org.publicwhip/member/896':
                            fosterid = remadecons[0]
                    if fosterid:
                        speakerid = fosterid
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # Also reached when the Michael Foster special case
                        # cannot be resolved; previously that path left the
                        # id as None and crashed on .encode() below.
                        speakerid = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    speakerid = 'uk.org.publicwhip/member/40316'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker speakerid="%s"%s>%s</speaker>\n' % \
                (speakerid.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)

コード例 #28

0

ファイルを表示

ファイル: expenses2007.py プロジェクト: JonathanBowker/parlparse

for line in content:
	line = line.strip()
	if not line or re.match('#', line):
		continue
	cols = line.split("\t")
	name = cols[0]
	m = re.match('(.*?), (.*)$', name)
	name = '%s %s' % (m.group(2), m.group(1))
	money = cols[1:16]
	money = map(lambda x: re.sub("\xa3","", x), money)
	money = map(lambda x: re.sub(",","", x), money)
	id = None
	cons = None
	if name == 'Mr Michael Foster':
		cons = 'Worcester'
	id, name, cons =  memberList.matchfullnamecons(name, cons, yeardate)
	#if not id:
	#	id, name, newcons =  memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate)
	if not id:
		raise Exception, "Failed to find MP in line %s" % line
	pid = memberList.membertoperson(id)
#	print >>sys.stderr, last, first, money
	if id in expmembers:
		print >>sys.stderr, "Ignored repeated entry for " , id
	else:
		fout.write('<personinfo id="%s" ' % pid)
		for i in [ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 ]:
			if i==0 or i==1 or i==2 or i==3:
				col = i + 1
			elif i==4: col = '5a'
			elif i==5: col = '5b'

コード例 #29

0

ファイルを表示

ファイル: expenses.py プロジェクト: samknight/parlparse

#    if re.search("Not Found(?i)", content):
#        raise Exception, "Failed to get content in url %s" % test_url

#    matcher = '<TD ALIGN="LEFT" VALIGN="TOP"><A HREF="(/weblink/html/member.html/.*)/log=\d+/pos=\d+" TARGET="_parent"><font face="arial,helvetica" size=2>(.*)/(.*)</A></TD>\s*<TD ALIGN="LEFT" VALIGN="TOP"><font face="arial,helvetica" size=2>(.*)</TD>'
#    matches = re.findall(matcher, content)

	for line in content:
		cols = line.split("\t")
		first = cols[0]
		last = cols[1]
		cons = cols[2]
		money = cols[3:]
		money = map(lambda x: re.sub("\xa3","", x), money)
		money = map(lambda x: re.sub(",","", x), money)
		id, name, cons =  memberList.matchfullnamecons(first + " " + last, cons, yeardate)
		if not id:
			raise Exception, "Failed to find MP %s %s" % (first, last)

		pid = memberList.membertoperson(id)
#		print >>sys.stderr, last, first, money
		if pid in expmembers:
			print >>sys.stderr, "Ignored repeated entry for " , pid
		else:
			fout.write('<personinfo id="%s" ' % pid)
			for i in [ 0,1,2,3,4,5,6,7,8,9 ]:
				if (year=='2004'):
					if (i==7):
						col = '7a'
					elif (i==8 or i==9):
						col = i

コード例 #30

0

ファイルを表示

file.close()

for line in content:
	cols = line.split("\t")
	cons = cols[0]
	money = cols[1:11]
	first = ''
	last = ''
	if (len(cols)>11):
		last = cols[11]
		first = cols[12]
	money = map(lambda x: re.sub("\xa3","", x), money)
	money = map(lambda x: re.sub(",","", x), money)
	id = None
	if first and last:
		id, name, newcons =  memberList.matchfullnamecons(first + ' ' + last, cons, yeardate)
		if not id:
			id, name, newcons =  memberList.matchfullnamecons(first + ' ' + last, cons, otheryeardate)
		cons = newcons
	if not id:
		id, name, cons =  memberList.matchcons(cons, yeardate)
	if not id:
		raise Exception, "Failed to find MP in line %s" % line
	pid = memberList.membertoperson(id)
#	print >>sys.stderr, last, first, money
	if id in expmembers:
		print >>sys.stderr, "Ignored repeated entry for " , id
	else:
		fout.write('<personinfo id="%s" ' % pid)
		for i in [ 0,1,2,3,4,5,6,7,8,9 ]:
			if (i==7):

コード例 #31

0

ファイルを表示

ファイル: division.py プロジェクト: samknight/parlparse

def MpList(fsm, vote, stampurl, sdate):
    """Convert the name lines of one side of a division into <mpname> tags.

    Lines holding only a constituency are merged onto the preceding line,
    concatenated names are split apart, and each name is matched with
    memberList.matchfullnamecons.  When the matcher reports "MultipleMatch"
    the candidate ids are handed out in order, one per occurrence of the
    name, and at the end every candidate must have been used.

    Args:
        fsm: iterable of text lines; empty entries are skipped.
        vote: value written into the vote="..." attribute.
        stampurl: passed through to ContextException as `stamp`.
        sdate: date handed to the name matcher.

    Returns:
        A list of '\\t<mpname person_id="..." vote="...">...</mpname>'
        strings, one per matched name.

    Raises:
        ContextException: on an unparseable or unmatched name, on a
            constituency-only line with no preceding name, or when an
            ambiguous name occurs more or fewer times than it has candidates.
    """
    # Merge lone listed constituencies onto end of previous line
    newfsm = []
    for fss in fsm:
        if not fss:
            continue
        if reconstnm.match(fss):
            if not newfsm:
                # Nothing to merge onto: report it like every other parse
                # failure instead of failing with a bare IndexError.
                raise ContextException(
                    "Constituency with no preceding name (division)",
                    stamp=stampurl,
                    fragment=fss)
            newfsm[-1] += " " + fss
        else:
            newfsm.append(fss)

    res = []

    # from tuple of candidate ids to the names already assigned from it
    multimatches = {}

    for fss in newfsm:
        # break up concatenated lines
        # Beresford, Sir PaulBlunt, Crispin
        while re.search(r'\S', fss):
            # there was an & in [A-Z] on line below, but it broke up this incorrectly:
            # Simon, Si&#244;n <i>(B'ham Erdington)</i>
            regsep = re.search(r'(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$',
                               fss)
            regsep2 = re.match(r'(.*?,.*?)  ([A-Z].*?,.*)$', fss)
            if regsep and not re.search('  Mc$', regsep.group(1)):
                fssf = regsep.group(1)
                fss = regsep.group(2)
            elif regsep2:
                fssf = regsep2.group(1)
                fss = regsep2.group(2)
            else:
                fssf = fss
                fss = ''

            # Alphabetical order is deliberately not checked: "rh" and so on
            # confound it.

            # flipround the name
            # Bradley, rh Keith <i>(Withington)</i>
            # Simon, Sio(r)n <i>(Withington)</i>
            ginp = reflipname.match(fssf)
            if ginp:
                fnam = '%s %s' % (ginp.group(2), ginp.group(1))
                cons = ginp.group(3)
            else:
                # name not being flipped, is firstname lastname
                ginp = renoflipname.match(fssf)
                if not ginp:
                    raise ContextException(
                        "No flipped or non-flipped name match (division)",
                        stamp=stampurl,
                        fragment=fssf)
                fnam = ginp.group(1)
                cons = ginp.group(2)

            (mpid, remadename,
             remadecons) = memberList.matchfullnamecons(fnam,
                                                        cons,
                                                        sdate,
                                                        alwaysmatchcons=False)
            if not mpid:
                if remadename != "MultipleMatch":
                    print("division.py: no match for %s %s %s" %
                          (fnam, cons, sdate))
                    raise ContextException("No match on name",
                                           stamp=stampurl,
                                           fragment=fnam)

                # actually the list of ids
                assert isinstance(remadecons, tuple)
                used = multimatches.setdefault(remadecons, [])
                i = len(used)  # the index we work with
                if i >= len(remadecons):
                    print("Name %s used too many times for list %s "
                          "where other instances are %s" %
                          (fnam, remadecons, used))
                    raise ContextException("Too many instances",
                                           stamp=stampurl,
                                           fragment=fnam)
                # Multiple matching is ignorable when both ambiguous people
                # vote on the same side of a division.
                mpid = remadecons[i]
                used.append(fnam)

            res.append('\t<mpname person_id="%s" vote="%s">%s</mpname>' %
                       (mpid, vote, FixHTMLEntities(fssf)))

    # now we have to check if the multimatched names were all exhausted
    for ids, names in multimatches.items():
        if len(names) != len(ids):
            print("Insufficient vote matches on name %s ids taken to %s" %
                  (names, ids))
            raise ContextException("Not enough vote match on ambiguous name",
                                   stamp=stampurl,
                                   fragment=names[0])
    return res

コード例 #32

0

ファイルを表示

    origcons = origcons.replace("Stretford and ~~~~~~~",
                                "Stretford and Urmston")

    # no longer in house - TODO give better date
    if origname == "Dennis Canavan" or origname == "Rt Hon Paul Daisley":
        continue

    if origcons == "South Tomshire":  # better keep rosa's membership of parliament secret
        continue
    if origcons == "Trumpton":  # i didn't know james was religious
        continue
    if origcons == "Stefstown":  # i didn't know stef was knighted
        continue

    try:
        id, name, cons = memberList.matchfullnamecons(origname, origcons,
                                                      date_today)
    except Exception, e:
        print >> sys.stderr, "FaxYourMP name match failed"
        print >> sys.stderr, e
    else:
        if voteside.lower() == "no":
            nohash[id] = nohash.get(id, 0) + int(votecount)
        elif voteside.lower() == "yes" or voteside.lower() == "yes" + chr(160):
            yeshash[id] = yeshash.get(id, 0) + int(votecount)
        else:
            raise Exception, "Strange vote %s" % voteside

ih.close()


def responsiveness(id):

コード例 #33

0

ファイルを表示

ファイル: filter.py プロジェクト: JonathanBowker/parlparse

def RunRegmemFilters2010(fout, text, sdate, sdatever):
    print "2010-? new register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    memberset = set()
    text = re.sub('<span class="highlight">([^<]*?)</span>', r'\1', text)
    t = BeautifulStoneSoup(text)
    for page in t('page'):
        title = page.h2.renderContents()
        if title in ('HAGUE, Rt Hon William (Richmond (Yorks)',
                     'PEARCE, Teresa (Erith and Thamesmead'):
            title += ')'
        res = re.search("^([^,]*), ([^(]*) \((.*)\)\s*$", title)
        if not res:
            raise ContextException, "Failed to break up into first/last/cons: %s" % title
        (lastname, firstname, constituency) = res.groups()
        firstname = memberList.striptitles(firstname)[0].decode('utf-8')
        lastname = memberList.lowercaselastname(lastname).decode('utf-8')
        constituency = constituency.decode('utf-8')
        lastname = lastname.replace(u'O\u2019brien', "O'Brien")  # Hmm
        (id, remadename,
         remadecons) = memberList.matchfullnamecons(firstname + " " + lastname,
                                                    constituency, sdate)
        if not id:
            raise ContextException, "Failed to match name %s %s (%s) date %s\n" % (
                firstname, lastname, constituency, sdate)
        fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' %
                    (id, remadename, sdate)).encode("latin-1"))
        memberset.add(id)
        category = None
        categoryname = None
        subcategory = None
        record = False
        for row in page.h2.findNextSiblings():
            text = row.renderContents().decode('utf-8').encode(
                'iso-8859-1', 'xmlcharrefreplace')
            if row.get('class') == 'spacer':
                if record:
                    fout.write('\t\t</record>\n')
                    record = False
                continue
            if not text or re.match('\s*\.\s*$', text): continue
            if text == '<strong>%s</strong>' % title: continue
            if re.match('\s*Nil\.?\s*$', text):
                fout.write('Nil.\n')
                continue
            # Since 2015 election, register is all paragraphs, no headings :(
            if row.name == 'h3' or row.get('class') == 'shd0' or re.match(
                    '<strong>\d+\. ', text):
                if re.match('\s*$', text): continue
                m = re.match(
                    "(?:\s*<strong>)?\s*(\d\d?)\.\s*(.*)(?:</strong>\s*)?$",
                    text)
                if m:
                    if record:
                        fout.write('\t\t</record>\n')
                        record = False
                    if category:
                        fout.write('\t</category>\n')
                    category, categoryname = m.groups()
                    subcategory = None
                    categoryname = re.sub('<[^>]*>(?s)', '',
                                          categoryname).strip()
                    fout.write('\t<category type="%s" name="%s">\n' %
                               (category, categoryname))
                    continue
            if not record:
                fout.write('\t\t<record>\n')
                record = True
            subcategorymatch = re.match("\s*\(([ab])\)\s*(.*)$", text)
            if subcategorymatch:
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t\t(%s)\n' % subcategory)
                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, subcategorymatch.group(2)))
                continue
            if subcategory:
                fout.write('\t\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, text))
            else:
                fout.write('\t\t\t<item>%s</item>\n' % text)
        if record:
            fout.write('\t\t</record>\n')
            record = False
        if category:
            fout.write('\t</category>\n')
        fout.write('</regmem>\n')

    membersetexpect = set(
        [m['person_id'] for m in memberList.mpslistondate(sdate)])

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if len(missing) > 0:
        print "Missing %d MP entries:\n" % len(missing), missing
    extra = memberset.difference(membersetexpect)
    if len(extra) > 0:
        print "Extra %d MP entries:\n" % len(extra), extra

    fout.write("</publicwhip>\n")

コード例 #34

0

ファイルを表示

ファイル: regmemfilter.py プロジェクト: nrhorner/parlparse

def RunRegmemFilters2010(fout, text, sdate, sdatever):
    """Write the register of members' interests in `text` to `fout` as XML.

    Each <page> element is one member.  Its <h2> title is split into last
    name, first name and constituency, matched with
    memberList.matchfullnamecons, and the siblings following the heading are
    emitted as <category> and <item> elements inside a <regmem> element.
    Afterwards the ids seen are compared with memberList.mpslistondate(sdate)
    and any difference is printed.

    Args:
        fout: output object; written to with write().
        text: the register markup to parse.
        sdate: date used for name matching and written into each <regmem>.
        sdatever: accepted but not used here.

    Raises:
        ContextException: when a title cannot be split up or matched.
    """
    print("2010-? new register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    memberset = set()
    text = re.sub(r'<span class="highlight">([^<]*?)</span>', r'\1', text)
    t = BeautifulStoneSoup(text)
    for page in t('page'):
        title = page.h2.renderContents()
        res = re.search(r"^([^,]*), ([^(]*) \((.*)\)\s*$", title)
        if not res:
            raise ContextException("Failed to break up into first/last/cons: %s" % title)
        (lastname, firstname, constituency) = res.groups()
        firstname = memberList.striptitles(firstname)[0].decode('utf-8')
        lastname = memberList.lowercaselastname(lastname).decode('utf-8')
        constituency = constituency.decode('utf-8')
        lastname = lastname.replace(u'O\u2019brien', "O'Brien")  # Hmm
        (memberid, remadename, remadecons) = memberList.matchfullnamecons(firstname + " " + lastname, constituency, sdate)
        if not memberid:
            raise ContextException("Failed to match name %s %s (%s) date %s\n" % (firstname, lastname, constituency, sdate))
        fout.write(('<regmem personid="%s" memberid="%s" membername="%s" date="%s">\n' % (memberList.membertoperson(memberid), memberid, remadename, sdate)).encode("latin-1"))
        memberset.add(memberid)
        category = None
        subcategory = None
        for row in page.h2.findNextSiblings():
            # Named rowtext so the `text` parameter is not rebound.
            rowtext = row.renderContents().decode('utf-8').encode('iso-8859-1', 'xmlcharrefreplace')
            if not rowtext or re.match(r'\s*\.\s*$', rowtext):
                continue
            if re.match(r'\s*Nil\.?\s*$', rowtext):
                fout.write('Nil.\n')
                continue
            if row.name == 'h3':
                if re.match(r'\s*$', rowtext):
                    continue
                m = re.match(r"\s*(\d\d?)\.\s*(.*)$", rowtext)
                if m:
                    if category:
                        fout.write('\t</category>\n')
                    category, categoryname = m.groups()
                    subcategory = None
                    fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))
                    continue
                # An unnumbered h3 falls through and is written as an item.
            if row.get('class') == 'spacer':
                continue
            subcategorymatch = re.match(r"\s*\(([ab])\)\s*(.*)$", rowtext)
            if subcategorymatch:
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, subcategorymatch.group(2)))
                continue
            if subcategory:
                fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, rowtext))
            else:
                fout.write('\t\t<item>%s</item>\n' % rowtext)
        if category:
            fout.write('\t</category>\n')
        fout.write('</regmem>\n')

    membersetexpect = set(memberList.mpslistondate(sdate))

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if missing:
        # Single formatted string: the same output as the old two-item print
        # statement, which emitted no space after the newline.
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")

コード例 #35

0

ファイルを表示

ファイル: future-fetch.py プロジェクト: hzj123/theyworkforyou

    def __init__(self, entry):
        """Populate this object from one calendar feed entry.

        Reads the fields of `entry.event` (id, chamber, house, date, start
        and end time, committee, inquiry title, witnesses, location) plus
        `entry.guid` and `entry.link`.

        `people` and `witnesses` are always set, as lists of integer person
        ids.  `committee_name`, `debate_type`, `title`, `witnesses_str` and
        `location` are assigned only when the corresponding text is present.

        Names after the final " - " of the inquiry title are looked up as
        MPs and then as Lords; the suffix is removed from the title only
        when every name was identified.  Each distinct "<word> <word> MP"
        in the witness text that matches a member is wrapped in a link to
        that person and recorded once.
        """
        self.id = entry.event.attrib['id']
        self.deleted = 0
        self.link_calendar = entry.guid
        self.link_external = entry.link
        chamber = entry.event.chamber.text.strip()
        self.chamber = '%s: %s' % (entry.event.house.text.strip(), chamber)
        self.event_date = entry.event.date.text
        self.time_start = getattr(entry.event, 'startTime', None)
        self.time_end = getattr(entry.event, 'endTime', None)

        # NOTE(review): 'comittee' is the attribute name as read from the
        # entry; presumably it mirrors the feed's own spelling -- confirm
        # before "correcting" it.
        committee_text = entry.event.comittee.text
        if committee_text:
            committee_text = committee_text.strip()
            if chamber in ('Select Committee', 'General Committee'):
                self.committee_name = committee_text
            elif committee_text != "Prime Minister's Question Time":
                self.debate_type = committee_text

        self.people = []

        title_text = entry.event.inquiry.text
        if title_text:
            m = re.search(' - ([^-]*)$', title_text)
            if m:
                person_texts = [x.strip() for x in m.group(1).split('/')]

                for person_text in person_texts:
                    member_id, name, cons = memberList.matchfullnamecons(person_text, None, self.event_date)
                    if not member_id:
                        # Best-effort fallback to the Lords list; a failed
                        # lookup just leaves the person unidentified.
                        # Catching Exception (not a bare except) lets
                        # KeyboardInterrupt/SystemExit propagate.
                        try:
                            member_id = lordsList.GetLordIDfname(person_text, None, self.event_date)
                        except Exception:
                            pass
                    if member_id:
                        self.people.append(int(memberList.membertoperson(member_id).replace('uk.org.publicwhip/person/', '')))

                if len(self.people) == len(person_texts):
                    title_text = title_text.replace(' - ' + m.group(1), '')

            self.title = title_text.strip()
        elif committee_text == "Prime Minister's Question Time":
            self.title = committee_text

        self.witnesses = []
        witness_text = entry.event.witnesses.text
        if witness_text == 'This is a private meeting.':
            self.title = witness_text
        elif witness_text:
            self.witnesses_str = witness_text.strip()
            # str.replace() links every occurrence of a name at once, so a
            # name found more than once must be handled only once: a second
            # pass would wrap the link text in another <a> and record the
            # same person again.
            handled = set()
            for mp in re.findall(r'\b(\w+ \w+ MP)', self.witnesses_str):
                if mp in handled:
                    continue
                handled.add(mp)
                member_id, name, cons = memberList.matchfullnamecons(mp, None, self.event_date)
                if not member_id:
                    continue
                pid = int(memberList.membertoperson(member_id).replace('uk.org.publicwhip/person/', ''))
                mp_link = '<a href="/mp/?p=%d">%s</a>' % (pid, mp)
                self.witnesses.append(pid)
                self.witnesses_str = self.witnesses_str.replace(mp, mp_link)

        location_text = entry.event.location.text
        if location_text:
            self.location = location_text.strip()

コード例 #36

0

ファイルを表示

ファイル: faxyourmpfinaladdup.py プロジェクト: JonathanBowker/parlparse

    if made[0:4] != "2004":
        continue
    made_date = mx.DateTime.DateTimeFrom(made).date

    constituency = constituency.replace('\\', '')
    mp_name = mp_name.replace('\\', '')

    if constituency == "South Tomshire": # better keep rosa's membership of parliament secret
        continue
    if constituency == "Trumpton": # i didn't know james was religious
        continue
    if constituency == "Stefstown": # i didn't know stef was knighted
        continue
    
    try:
        mp_id, name, cons =  memberList.matchfullnamecons(mp_name, constituency, made_date)
    except Exception, e:
        print >>sys.stderr, "FaxYourMP name match failed", e
    else:
        if not mp_id:
            print >>sys.stderr, "FaxYourMP name match failed %s, %s" % (mp_name, constituency)
        else:
            id = memberList.membertoperson(mp_id)
            if vote.lower() == "no":
                nohash[id] = nohash.get(id, 0) + 1
            elif vote.lower() == "yes" or vote.lower() == "yes"+chr(160):
                yeshash[id] = yeshash.get(id, 0) + 1
            elif vote == "":
                # print >>sys.stderr, "Blank vote"
                # Ignore for now
                pass