コード例 #1
0
def FilterDebateSections(text, sdate, typ):
    """Flatten a debate or Westminster Hall transcript into a list of blocks.

    The text is split by SplitHeadingsSpeakers into tuples of
    (heading, unspoken text, [(speaker, text)], major?).  The leading page
    headings are stripped and the remaining tuples are walked in order to
    build ``flatb``: a flat list holding heading objects, division objects
    and qspeech objects.

    Args:
        text: the transcript text to process.
        sdate: the sitting date; only passed through to the helper functions.
        typ: "debate" or "westminhall"; any other value fails an assert.

    Raises:
        ContextException: re-raised unchanged from the per-heading processing.
        AssertionError: for an unknown ``typ``, or when a clause/schedule
            paragraph is found at the end of a speech but could not be moved
            into the following heading.

    NOTE(review): no ``return flatb`` is visible in this excerpt, so as shown
    the function returns None -- confirm against the complete file.
    NOTE(review): many patterns below put the global ``(?i)`` flag at the END
    of the pattern; Python 3.11+ rejects that placement.
    """
    # make the corrections at this level which enables the headings to be resolved.
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)
    else:
        assert typ == "westminhall"
        # this is crap!!!
        # collapse triple-nested lists and drop empty <h5> headings
        text = re.sub('<ul><ul><ul>(?i)', '<ul>', text)
        text = re.sub('</ul></ul></ul>(?i)', '</ul>', text)
        text = re.sub('<h5></h5>(?i)', '', text)

    # split into list of triples of (heading, pre-first speech text, [ (speaker, text) ])
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    # ih is the index of the first entry of headspeak to process (see the loop below)
    if typ == "debate":
        (ih, stampurl) = StripDebateHeadings(headspeak, sdate)
    elif typ == "westminhall":
        (ih, stampurl) = StripWestminhallHeadings(headspeak, sdate)
    else:
        assert False  # to be for writminstat?

    # loop through each detected heading and the detected partitioning of speeches which follow.
    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    # see the typ variable for the type.
    flatb = []
    state = {}  # passed to every NormalHeadingPart call
    #lastheading = None
    chair_head = 0  # NOTE(review): never read in the visible code
    for sht in headspeak[ih:]:
        try:
            # triplet of ( heading, unspokentext, [(speaker, text)], major? )
            headingtxt = stampurl.UpdateStampUrl(string.strip(
                sht[0]))  # we're getting stamps inside the headings sometimes
            headingmajor = sht[3]
            # major headings (and the final entry) of a debate are upper-cased
            if typ == 'debate' and (headingmajor
                                    or sht == headspeak[-1]):  # UGH again
                headingtxt = headingtxt.upper()
            unspoketxt = sht[1]
            speechestxt = sht[2]

            # the heading detection, as a division or a heading speech object
            # detect division headings
            gdiv = re.match('(?:<b>)?Division No. (\d+)(?i)', headingtxt)

            # heading type
            if not gdiv:  # and lastheading != headingtxt:
                qbh = NormalHeadingPart(headingtxt, stampurl, state, typ)
                # print "h ", qbh.typ, qbh.stext

                # ram together minor headings into previous ones which have no speeches
                if qbh.typ == 'minor-heading' and len(
                        flatb) > 0 and flatb[-1].typ == 'minor-heading':
                    flatb[-1].stext.append(" &mdash; ")
                    flatb[-1].stext.extend(qbh.stext)

                # ram together major headings into previous ones which have no speeches
                elif qbh.typ == 'major-heading' and len(
                        flatb) > 0 and flatb[-1].typ == 'major-heading':
                    flatb[-1].stext.append(" &mdash; ")
                    flatb[-1].stext.extend(qbh.stext)

                # a minor heading directly after a major one is merged into it when it
                # is an "Allotted/Allocated Day" line or the major heading is "Petition"
                elif qbh.typ == 'minor-heading' and len(flatb) > 0 and flatb[-1].typ == 'major-heading' and \
                    ( re.search('(Allotted|Allocated) Day(?i)', qbh.stext[-1]) or re.search('^Petition$(?i)', flatb[-1].stext[-1]) ):
                    flatb[-1].stext.append(" &mdash; ")
                    flatb[-1].stext.extend(qbh.stext)

                # "sitting suspended" / "on resuming" headings become speaker-less
                # speeches, but only directly after a speech; otherwise they are dropped
                elif re.search(
                        "(?:sitting suspended(?: for| until| till|\.))|(on resuming&)(?i)",
                        qbh.stext[0]):
                    if len(flatb) > 0 and flatb[-1].typ == 'speech':
                        qb = qspeech('nospeaker="true"', qbh.stext[0],
                                     stampurl)
                        qb.typ = 'speech'
                        FilterDebateSpeech(qb)
                        flatb.append(qb)

                # "[... in the Chair]" headings after a speech also become speaker-less speeches
                elif re.match(
                        "\[.*? in\s*the\s*Chair\.?\]$(?i)", qbh.stext[0]
                ) and len(flatb) > 0 and flatb[-1].typ == 'speech':
                    qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                    qb.typ = 'speech'
                    FilterDebateSpeech(qb)
                    flatb.append(qb)

                # this is where we suck in a trailing "Clause" part of the title that is mistakenly outside the heading.
                elif (qbh.typ == 'minor-heading' or qbh.typ == 'major-heading'
                      ) and len(flatb) > 0 and flatb[-1].typ == 'speech':
                    mmm = re.match(
                        '\s*<p>\s*((?:New )?(?:clause|schedule) \d+\w?)</p>(?i)',
                        flatb[-1].stext[-1])
                    if mmm:
                        if IsNotQuiet():
                            print "Clause/schedule moving", flatb[-1].stext[-1]
                        # prepend "<clause> &mdash; " to the new heading
                        qbh.stext.insert(0, " &mdash; ")
                        qbh.stext.insert(0, mmm.group(1))
                        flatb[-1].stext = flatb[
                            -1].stext[:-1]  # delete final value

                        # remove an empty speech
                        if not flatb[-1].stext:
                            if IsNotQuiet():
                                print "removing empty speech after moving 'clause/schedule' out"
                            assert flatb[-1].speaker == 'nospeaker="true"'
                            del flatb[-1]

                    # converting a search into a match, for safety, and double checking
                    else:
                        # a looser pattern still finding a clause/schedule paragraph means
                        # the strict match above missed a case: print it and abort
                        if re.search(
                                '<p>\s*((?:New )?\s*(?:clause|schedule)\s*\w+)\s*</p>(?i)',
                                flatb[-1].stext[-1]):
                            print flatb[-1].stext[-1]
                            assert False

                    flatb.append(qbh)

                # otherwise put out this heading
                else:
                    flatb.append(qbh)

            # division case
            elif gdiv:
                (unspoketxt,
                 qbd) = DivisionParsingPart(string.atoi(gdiv.group(1)),
                                            unspoketxt, stampurl, sdate)

                # grab some division text off the back end of the previous speech
                # and wrap into a new no-speaker speech
                # NOTE(review): flatb[-1] raises IndexError if flatb is still empty here
                qbdp = GrabDivisionProced(flatb[-1], qbd)
                if qbdp:
                    flatb.append(qbdp)
                flatb.append(qbd)

                # write out our file with the report of all divisions
                PreviewDivisionTextGuess(flatb)

            #lastheading = headingtxt

            # continue and output unaccounted for unspoken text occuring after a
            # division, or after a heading
            # (the pattern accepts text made only of tags and whitespace)
            if (not re.match('(?:<[^>]*>|\s)*$', unspoketxt)):
                qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
                qb.typ = 'speech'
                FilterDebateSpeech(qb)
                flatb.append(qb)

            # there is no text; update from stamps if there are any
            else:
                stampurl.UpdateStampUrl(unspoketxt)

            # go through each of the speeches in a block and put it into our batch of speeches
            for ss in speechestxt:
                qb = qspeech(ss[0], ss[1], stampurl)
                qb.typ = 'speech'
                FilterDebateSpeech(qb, bDebateBegToMove=True)

                qbdp = GrabWestminDivisionInterruptProced(
                    qb, ss[1])  # captures tail off westminster hall speeches
                flatb.append(qb)
                if qbdp:
                    flatb.append(qbdp)

        # NOTE(review): this handler only re-raises and never uses ``e``; as
        # shown it has no effect on control flow
        except ContextException, e:
            raise
コード例 #2
0
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite speaker names in a debate transcript as <speaker> elements.

    The text is split with ``recomb``.  Each fragment is either copied to
    ``fout`` or, when ``respeakervals`` recognises it as a speaker name,
    replaced by a ``<speaker ...>`` element built from the result of
    ``memberList.matchdebatename``.

    Args:
        fout: output stream; only ``write`` is called on it.
        text: the transcript text.
        sdate: sitting date, passed to the name-matching helpers.
        typ: "debate" or "westminhall".

    Raises:
        ContextException: if a Westminster Hall text has no
            "[... in the Chair]" phrase, if matching a speaker name fails,
            or if a fragment looks like a speaker but was not recognised.
    """
    if typ == "westminhall":
        # the name of the chair replaces bare "Deputy Speaker" attributions below
        depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        # only fix when the candidate text matches at least one member name
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name
    # (the global (?i) flag is placed first, where every Python version accepts it)
    urgentqns = re.findall('(?i)(<p>(?:<stamp aname="[^"]+"/>)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>)(:</b>)', text)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            # we can use oqnum to detect oral questions
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub("\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search("(?i)deputy[ \-]speaker", spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
コード例 #3
0
ファイル: filter.py プロジェクト: JonathanBowker/parlparse
def _WriteRegmemItem(fout, subcategory, content):
    """Write one register <item> line, tagged with the active subcategory if any."""
    if subcategory:
        fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                   (subcategory, content))
    else:
        fout.write('\t\t<item>%s</item>\n' % content)


def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert an HTML Register of Members' Interests page into XML.

    Registers dated 2010-09-01 or later are handed to RunRegmemFilters2010
    and its result is returned.  Older registers are parsed here: every
    ``<TR>`` row is cleaned of markup, split into cells and classified by
    the number and emptiness of its cells (member name, category heading,
    "Nil.", item, sub-category item).  Afterwards the set of members seen is
    compared with ``memberList.mpslistondate(sdate)`` and any difference is
    printed.

    Args:
        fout: output stream; only ``write`` is called on it.
        text: the HTML of the register.
        sdate: date of the register, compared as a string against
            'YYYY-MM-DD' literals.
        sdatever: only forwarded to RunRegmemFilters2010.

    Raises:
        ContextException: when a member row cannot be split into name and
            constituency, a member cannot be matched, a category number
            cannot be parsed, or a row has an unknown layout.
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    # message for cron so I check I'm using this
    print("New register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('&#128;', '&#163;', text)  # Always get some pound signs wrong

    rows = re.findall("<TR>(.*)</TR>", text)

    # markup clean-ups, applied to every row in exactly this order
    cleanups = [
        ("&nbsp;", " "),
        ("<B>|</B>|<BR>|`", ""),
        ('<span style="background-color: #FFFF00">|</span>', ''),
        ('<IMG SRC="3lev.gif">', ""),
        ("&#173;", "-"),
        ('\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', ''),
        ('\[<A NAME="n\d+">\d+\]', ''),
        # Fix incorrect tabling of categories when highlighting is in play
        ('<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>',
         r'<TD>\1</TD><TD COLSPAN=3>\2</TD>'),
    ]
    for pattern, replacement in cleanups:
        rows = [re.sub(pattern, replacement, row) for row in rows]

    # split into cells within a row
    rows = [re.findall("<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows]

    memberset = set()
    needmemberend = False  # a <regmem> element is open
    category = None        # a <category> element is open when set
    subcategory = None     # current "(a)"/"(b)" letter, if any
    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass
        elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # <TR><TD COLSPAN=4>&nbsp;</TD></TR>
            pass
        elif len(row) == 1:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search("^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print(row)
                raise ContextException(
                    "Failed to break up into first/last/cons: %s" % row[0])
            (lastname, firstname, constituency) = res.groups()
            constituency = constituency.replace(')', '')
            constituency = constituency.replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                    or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                check_date = '2009-06-08'
            else:
                check_date = sdate
            (personid, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname),
                constituency, check_date)
            if not personid:
                raise ContextException(
                    "Failed to match name %s %s (%s) date %s" %
                    (firstname, lastname, constituency, sdate))

            # close whatever the previous member left open
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
            fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' %
                        (personid, remadename, sdate)).encode("latin-1"))
            memberset.add(personid)
            needmemberend = True
            category = None
            subcategory = None
        elif len(row) == 2 and row[0] == '' and re.match('Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')
        elif len(row) == 2 and row[0] != '':
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            # fail with context, not an AttributeError on None, when the
            # first cell is not a category number
            categorymatch = re.match("\s*(\d\d?)\.$", row[0])
            if not categorymatch:
                print(row)
                raise ContextException(
                    "Failed to parse category number: %s" % row[0])
            if category:
                fout.write('\t</category>\n')
            category = categorymatch.group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' %
                       (category, categoryname))
        elif len(row) == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[1]))
        elif len(row) == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[2]))
        elif len(row) == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            _WriteRegmemItem(fout, subcategory,
                             FixHTMLEntities(row[1] + ' ' + row[2]))
        elif len(row) == 4 and row[0] == '' and (
                row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match("\(([ab])\)$", row[2])
            if not subcategorymatch:
                _WriteRegmemItem(fout, subcategory,
                                 FixHTMLEntities(row[2] + " " + row[3]))
            else:
                # a new "(a)"/"(b)" marker starts a sub-category
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[3]))
        else:
            print(row)
            raise ContextException(
                "Unknown row type match, length %d" % (len(row)))

    # close the last member's open elements
    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')

    membersetexpect = set(
        m['person_id'] for m in memberList.mpslistondate(sdate))

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if len(missing) > 0:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if len(extra) > 0:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
コード例 #4
0
ファイル: speakers.py プロジェクト: samknight/parlparse
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite speaker names in a debate transcript as <speaker> elements.

    The text is split with ``recomb``.  Each fragment is either copied to
    ``fout`` or, when ``respeakervals`` recognises it as a speaker name,
    replaced by a ``<speaker ...>`` element built from the result of
    ``memberList.matchdebatename``.

    Args:
        fout: output stream; only ``write`` is called on it.
        text: the transcript text.
        sdate: sitting date, passed to the name-matching helpers.
        typ: "debate" or "westminhall".

    Raises:
        ContextException: if a Westminster Hall text has no
            "[... in the Chair]" phrase, if matching a speaker name fails,
            or if a fragment looks like a speaker but was not recognised.
    """
    if typ == "westminhall":
        # the name of the chair replaces bare "Deputy Speaker" attributions below
        depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        # only fix when the candidate text matches at least one member name
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name
    # (the global (?i) flag is placed first, where every Python version accepts it)
    urgentqns = re.findall('(?i)(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)', text)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # bold fragments that are numbered lines, CHAPTER/PART titles or runs of
        # capitals, commas and spaces are copied through unchanged
        if re.match('<b>(&#8220;)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            # we can use oqnum to detect oral questions
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub("\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search("(?i)deputy[ \-]speaker", spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
コード例 #5
0
def FilterWransSpeakers(fout, text, sdate):
    """Rewrite bold speaker names in written answers as <speaker> elements.

    The text is split on tables and bold runs.  Tables are left alone; each
    bold run holding a name is matched with ``memberList.matchfullnamecons``
    and replaced by a ``<speaker speakerid=... speakername=...>`` element.
    Text that sat inside the bold tags but is not part of the name (trailing
    text after the colon, a square-bracket note) is pushed into the following
    fragment, and a question number immediately before the name is pulled
    out of the preceding fragment.  All fragments are then written to
    ``fout``.

    Args:
        fout: output stream; only ``writelines`` is called on it.
        text: the written-answers text.
        sdate: sitting date, passed to the name-matching helpers.

    Raises:
        ContextException: when a name matches no member and is not covered
            by one of the hard-coded special cases.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out
    # the global (?i) flag is placed first, where every Python version accepts it
    tableregexp = '(?i)' + ltableregexp
    lregexp = '(?i)(%s|%s)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    # index loop on purpose: fs[i-1] and fs[i+1] are rewritten while scanning
    for i in range(len(fs)):
        fss = stampurl.UpdateStampUrl(fs[i])  # Speakers have new stamps in them

        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall('(?i)<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search('\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceeding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall('^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        if robj:
            # TODO: do something with robj.group(1) here (it is the "failed
            # oral questions" signifier); for now it is discarded
            boldnamestring = robj.group(2)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            speakerid = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (speakerid, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons=False)
            if speakerid and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not speakerid:
                if remadename == "MultipleMatch":
                    # hard-coded disambiguation; on a MultipleMatch remadecons
                    # is indexed as a sequence of candidate member ids
                    if boldnamestring == 'Mr. Michael Foster':
                        if remadecons[1] == 'uk.org.publicwhip/member/1939':
                            speakerid = remadecons[1]
                            remadename = ' speakername="Michael Foster"'
                        elif remadecons[0] == 'uk.org.publicwhip/member/896':
                            speakerid = remadecons[0]
                            remadename = ' speakername="Michael Foster"'
                    # generic ambiguous case; also covers a 'Mr. Michael Foster'
                    # match where neither expected member id was offered, which
                    # used to leave the id empty and break the output below
                    if not speakerid:
                        speakerid = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    speakerid = 'uk.org.publicwhip/member/40316'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker speakerid="%s"%s>%s</speaker>\n' % \
            (speakerid.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)
コード例 #6
0
ファイル: colnum.py プロジェクト: samknight/parlparse
def FilterWransColnum(fout, text, sdate):
    """Replace column markers and anchors in written answers with stamps.

    The text is split with ``recomb`` and each fragment is classified:

    * a column heading (``recolumnumvals``) is checked against ``sdate`` and
      the running column number, then written as a ``<stamp coldate=...
      colnum="...W"/>`` element preceded by a space;
    * a column continuation (``recolnumcontvals``) is checked and written as
      a single space, or as a new stamp when it starts the next column;
    * an anchor (``reanamevals``) is written as ``<stamp aname="..."/>``;
    * anything else is copied through unchanged.

    Args:
        fout: output stream; only ``write`` is called on it.
        text: the written-answers text.
        sdate: sitting date, compared with the date found in column markers.

    Raises:
        ContextException: when a column date or number disagrees with what
            is expected, or a fragment matches ``recomb`` but none of the
            specific patterns.
    """
    # Legacy individual substitution rules, followed by junk removal
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages

    def reject(message, piece):
        # every failure carries the offending fragment and the current stamp
        raise ContextException(message, fragment=piece, stamp=stamp)

    def emit_column_stamp(number):
        # record the new column on the stamp object and write it out
        stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, number)
        fout.write(' ')
        fout.write(stamp.stamp)

    current = -1  # last column number seen; -1 until the first marker
    for piece in recomb.split(text):
        heading = recolumnumvals.match(piece)
        if heading:
            if sdate != mx.DateTime.DateTimeFrom(heading.group(1)).date:
                reject("Column date disagrees %s -- %s" % (sdate, piece),
                       piece)

            seen = string.atoi(heading.group(2))
            # column numbers do get skipped during division listings, so
            # only a number going backwards is treated as an error
            if current != -1 and seen < current:
                reject("Colnum not incrementing %d -- %s" % (seen, piece),
                       piece)

            current = seen
            emit_column_stamp(seen)
            continue

        cont = recolnumcontvals.match(piece)
        if cont:
            datetext = cont.group(1) or cont.group(3) or None
            numtext = cont.group(2) or cont.group(4) or None
            if datetext:
                if sdate != mx.DateTime.DateTimeFrom(datetext).date:
                    reject("Cont column date disagrees %s -- %s" %
                           (sdate, piece), piece)
                seen = string.atoi(numtext)
                # the number check is only enforced before 2006-05-08
                if sdate < '2006-05-08' and current != seen:
                    reject("Cont column number disagrees %d -- %s" %
                           (current, piece), piece)

                # no need to output anything
                fout.write(' ')
                continue

            if cont.group(5):
                seen = string.atoi(cont.group(5))
                if current not in (seen, seen + 1):
                    reject("Cont column number disagrees %d -- %s" %
                           (current, piece), piece)
                fout.write(' ')
                continue

            if cont.group(6):
                seen = string.atoi(cont.group(6))
                if seen != current + 1:
                    reject("Cont column number disagrees %d -- %s" %
                           (current, piece), piece)
                current = seen
                emit_column_stamp(seen)
                continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(piece)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(piece):
            reject('regexpvals not general enough', piece)

        # (the marginal-case check that used to sit here was removed
        # 2007-05-25)
        fout.write(piece)
コード例 #7
0
ファイル: coltime.py プロジェクト: samknight/parlparse
def FilterDebateColTime(fout, text, sdate, typ):
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    stamp = StampUrl(sdate)  # for error messages
    btodaytype = re.match('<pagex [^>]*type="today"', text)
    if btodaytype:
        fout.write('<stamp colnum="000"/>\n')

    colnum = -1
    previoustime = []
    for fss in recomb.split(text):
        # column number type
        columng = recolumnumvals.match(fss)
        if columng:
            assert not btodaytype  # no columns in today

            # check date
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       stamp=stamp,
                                       fragment=fss)

            # check number
            lcolnum = string.atoi(columng.group(2))
            if lcolnum == colnum - 1:
                pass  # spurious decrementing of column number stamps
            elif (colnum == -1) or (lcolnum == colnum + 1):
                pass  # good
            # column numbers do get skipped during division listings
            elif lcolnum < colnum:
                raise ContextException(
                    "Colnum not incrementing %d smaller than %d -- %s" %
                    (lcolnum, colnum, fss),
                    stamp=stamp,
                    fragment=fss)

            # write a column number stamp (has to increase no matter what)
            if lcolnum > colnum:
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate,
                                                                      lcolnum)
            fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
            continue

        columncg = recolnumcontvals.match(fss)
        if columncg:
            ldate = mx.DateTime.DateTimeFrom(columncg.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       stamp=stamp,
                                       fragment=fss)

            lcolnum = string.atoi(columncg.group(2))
            if colnum != lcolnum and sdate < '2006-05-08':
                raise ContextException(
                    "Cont column number disagrees %d -- %s" % (colnum, fss),
                    stamp=stamp,
                    fragment=fss)

            continue

        timeg = retimevals.match(fss)
        if timeg:
            time = TimeProcessing(timeg.group(1), previoustime,
                                  (timeg.group(0)[0] == '['), stamp)
            if not time:
                raise ContextException("Time not matched: " + timeg.group(1),
                                       stamp=stamp,
                                       fragment=fss)

            fout.write('<stamp time="%s"/>' % time)
            previoustime.append(time)
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            aname = anameg.group(1)
            stamp.aname = '<stamp aname="%s"/>' % aname
            fout.write('<stamp aname="%s"/>' % aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            print "$$$", fss, "$$$"
            print regcolnumcont
            print re.match(regcolnumcont + "(?i)", fss)
            raise ContextException('regexpvals not general enough',
                                   stamp=stamp,
                                   fragment=fss)
        if remarginal.search(fss):
            print fss
            print '--------------------------------\n'
            print "marginal found: ", remarginal.search(fss).groups()
            print "zeroth: ", remarginal.search(fss).group(0)
            print '--------------------------------\n'
            raise ContextException('marginal coltime/a detection case',
                                   stamp=stamp,
                                   fragment=fss)
        fout.write(fss)
コード例 #8
0
ファイル: colnum.py プロジェクト: henare/parlparse
def FilterWransColnum(fout, text, sdate):
    """Rewrite column markers and anchor names in written-answers text.

    The text is split with ``recomb`` and each piece is tried as a column
    marker, a column-continuation marker and an HTML anchor name.  Column
    markers (and continuation markers that move on to the next column) are
    replaced by a space followed by a ``<stamp coldate=... colnum="...W"/>``
    tag; other continuation markers are replaced by a single space; anchor
    names become ``<stamp aname=.../>``.  All other pieces are written to
    ``fout`` unchanged.

    fout  -- object whose write() method receives the filtered text.
    text  -- the text to filter.
    sdate -- date the markers must agree with; also compared against the
             literal "2006-05-08" when checking continuation markers.

    Raises ContextException when a marker disagrees with sdate or with the
    running column number, or when a piece matches ``recomb`` at its start
    without having been handled by one of the specific patterns.
    """
    # Legacy individual substitution rules, then remove junk
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages

    colnum = -1  # stays -1 until the first column marker has been seen
    for piece in recomb.split(text):
        # ---- column number marker ----
        colg = recolumnumvals.match(piece)
        if colg is not None:
            founddate = mx.DateTime.DateTimeFrom(colg.group(1)).date
            if sdate != founddate:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, piece), fragment=piece, stamp=stamp)

            newcol = int(colg.group(2))
            # Forward jumps are accepted (column numbers do get skipped
            # during division listings); going backwards is not.
            if colnum != -1 and newcol < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (newcol, piece), fragment=piece, stamp=stamp)

            colnum = newcol
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, newcol)
            fout.write(" ")
            fout.write(stamp.stamp)
            continue

        # ---- column continuation marker ----
        contg = recolnumcontvals.match(piece)
        if contg is not None:
            contdate = contg.group(1) or contg.group(3) or None
            contcol = contg.group(2) or contg.group(4) or None

            if contdate:
                # dated form: must agree with the sitting date and, before
                # 2006-05-08, with the running column number
                contdate = mx.DateTime.DateTimeFrom(contdate).date
                if sdate != contdate:
                    raise ContextException(
                        "Cont column date disagrees %s -- %s" % (sdate, piece), fragment=piece, stamp=stamp
                    )
                contcol = int(contcol)
                if colnum != contcol and sdate < "2006-05-08":
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, piece), fragment=piece, stamp=stamp
                    )
                # no stamp is needed, just a separating space
                fout.write(" ")
                continue

            elif contg.group(5):
                # undated form naming the running column or the one before it
                contcol = int(contg.group(5))
                if colnum != contcol and colnum != contcol + 1:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, piece), fragment=piece, stamp=stamp
                    )
                fout.write(" ")
                continue

            elif contg.group(6):
                # undated form that must name the next column; it advances
                # the running column and emits a fresh stamp
                contcol = int(contg.group(6))
                if colnum + 1 != contcol:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, piece), fragment=piece, stamp=stamp
                    )
                colnum = contcol
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, contcol)
                fout.write(" ")
                fout.write(stamp.stamp)
                continue
            # a match with none of those groups set falls through to the
            # remaining checks below

        # ---- anchor names from HTML <a name="xxx"> ----
        anameg = reanamevals.match(piece)
        if anameg is not None:
            stamp.aname = '<stamp aname="%s"/>' % anameg.group(1)
            fout.write(stamp.aname)
            continue

        # Nothing detected: check that we have not missed anything obvious.
        # (The marginal column-number check was deliberately removed on
        # 2007-05-25.)
        if recomb.match(piece):
            raise ContextException("regexpvals not general enough", fragment=piece, stamp=stamp)

        fout.write(piece)
コード例 #9
0
ファイル: sections.py プロジェクト: samknight/parlparse
def FilterWransSections(text, sdate, lords=False):
    """Flatten written-answers text into a list of headings and speeches.

    The text is split into heading entries by SplitHeadingsSpeakers and the
    leading entries are consumed by StripWransHeadings.  Each remaining
    entry either becomes a 'major-heading' object (when it has no speeches
    and its title is in parlPhrases.wransmajorheadings) or is expanded into
    question/reply objects, with a 'minor-heading' object inserted before
    the first question of every question-reply group.

    text  -- the text to process.
    sdate -- date of the text; passed to the helpers and compared against
             the literal '2006-05-07' when classifying headings.
    lords -- passed through to FilterQuestion.

    Returns the flat list of qspeech objects in document order.

    Raises ContextException for text found directly under a heading, for
    unrecognised or broken headings, for a reply with no preceding question
    and for a question left unanswered at the end of a heading; raises
    Exception when a recognised major heading has speeches under it.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    majors = parlPhrases.wransmajorheadings

    def majorheading(title):
        # heading object for a recognised major heading; the title comes
        # straight from the map so it needs no fixing
        qbmajor = qspeech('nospeaker="true"', title, stampurl)
        qbmajor.typ = 'major-heading'
        qbmajor.stext = [title]
        return qbmajor

    # full flat list of headings, questions and replies
    flatb = []
    # For when they put another "Written Answers to Questions" and date
    aftertitle = False
    for sht in headspeak[ih:]:
        # each entry starts ( heading, unspokentext, [(speaker, text)] );
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt, speechestxt = sht[1], sht[2]

        # only tags and whitespace may sit between a heading and its first
        # speech; the stamps in it are still picked up
        if not re.match('(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans",
                                   stamp=stampurl,
                                   fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        if not speechestxt:
            # A heading with nothing under it has to be a major heading.
            if not re.search('[a-z]', headingtxt):
                # no lower-case letters: looked up exactly as written
                if headingtxt not in majors:
                    raise ContextException(
                        "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                        fragment=headingtxt,
                        stamp=stampurl)
                flatb.append(majorheading(majors[headingtxt]))
                continue

            if sdate > '2006-05-07':
                # mixed-case headings are looked up in upper case
                if headingtxt == 'Written Answers to Questions':
                    aftertitle = True
                    continue
                upperheading = headingtxt.upper()
                if upperheading in majors:
                    flatb.append(majorheading(majors[upperheading]))
                    aftertitle = False
                    continue
                if aftertitle:
                    # the unrecognised heading right after a repeated title
                    # is skipped
                    aftertitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt,
                    stamp=stampurl)

            raise ContextException('broken heading %s' % headingtxt,
                                   stamp=stampurl,
                                   fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in majors:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        expectquestion = True

        # go through each of the speeches in a block and put it into our
        # batch of speeches
        qnums = []  # question numbers seen so far in the current group
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = re.findall('\[(?:HL)?(\d+)R?\]', ss[1])

            isquestion = (
                re.match('(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty(&#039;|&#146;|\')s Government|the ))(?i)', qb.text)
                or re.search('<wrans-question>', qb.text))

            if not isquestion:
                # ---- the reply ----
                if expectquestion:
                    raise ContextException('start of question expected',
                                           stamp=qb.sstampurl,
                                           fragment=qb.text)
                qb.typ = 'reply'

                # A reply closes the group unless the (rare) marker tag,
                # put in by the html corrections, says another follows.
                if re.search("\<another-answer-to-follow\>", qb.text):
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    expectquestion = True

                # Question numbers repeated inside an answer are
                # deliberately not checked against qnums.
                qb.stext = FilterReply(qb)
                flatb.append(qb)
                continue

            # ---- question posed ----
            qb.text = qb.text.replace('<wrans-question>', '')
            qb.typ = 'ques'

            # Several questions can be answered together, so a heading is
            # only put out in front of the first question of a group.
            if expectquestion:
                # the heading takes the same stampurl as its first question
                qbh = qspeech(headingmark, headingtxtfx, qb.sstampurl)
                qbh.typ = 'minor-heading'
                qbh.stext = [headingtxtfx]
                flatb.append(qbh)

                expectquestion = False

                # later headings in this block are created by us and
                # weren't in the original text, so they get marked
                headingmark = 'nospeaker="true" inserted-heading="true"'
                qnums = lqnums  # reset the qnums count
            else:
                qnums.extend(lqnums)

            qb.stext = FilterQuestion(qb, sdate, lords)
            if not lqnums:
                qb.stext.append(
                    ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> ')

            flatb.append(qb)

        if not expectquestion:
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here.  Does what I want for now...
            raise ContextException("missing answer to question",
                                   stamp=stampurl,
                                   fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb
コード例 #10
0
def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with speaker tags.

    After the legacy fix substitutions, paragraphs that start with a
    plausible member name but lack the bold markup get it added.  The text
    is then split on tables and bold runs; each bold run containing a name
    is matched through memberList and rewritten in place as
    ``<speaker person_id="..." speakername="...">name</speaker>``.  The
    resulting pieces are written to ``fout``.

    fout  -- object whose writelines() method receives the result.
    text  -- the text to process.
    sdate -- date of the text, passed to the name-matching helpers.

    Raises ContextException("No name match") when a bold name cannot be
    matched and is not one of the special cases handled below.  Ambiguous
    names are not an error: they are written with person_id "unknown" and
    an error attribute.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        '(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
        text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        namematches = memberList.fullnametoids(p3, sdate)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if namematches:
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out
    tableregexp = ltableregexp + '(?i)'

    lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Indexed loop on purpose: a piece may push text into fs[i + 1] or pull
    # a question number out of fs[i - 1].
    for i in range(len(fs)):
        fss = fs[i]
        fss = stampurl.UpdateStampUrl(fss)  # Speakers have new stamps in them

        # tables are left exactly as they are
        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search('\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                '^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
                boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename,
             remadecons) = memberList.matchfullnamecons(name,
                                                        cons,
                                                        sdate,
                                                        alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not person_id:
                if remadename == "MultipleMatch":
                    if (boldnamestring == 'Mr. Michael Foster' and
                            remadecons[0] == 'uk.org.publicwhip/person/10209'):
                        # hard-wired resolution of one known ambiguity
                        person_id = remadecons[0]
                        remadename = ' speakername="Michael Foster"'
                        remadecons = 'Worcester'
                    else:
                        # Every other ambiguous match is recorded as unknown.
                        # That includes a 'Mr. Michael Foster' whose first
                        # candidate is not the expected id, so person_id and
                        # remadename are always set before the tag is built.
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    # hard-wired match for this name on this one date
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match",
                                           stamp=stampurl,
                                           fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \
          (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)