Exemple #1
def StripWransHeadings(headspeak, sdate):
    # Validate and step past the leading headings of a written-answers file,
    # then return (index of the first heading left, StampUrl for the file).
    #
    # headspeak: sequence of heading records.  This code reads element [0] as
    #   the heading text and tests element [2] for truthiness (further down it
    #   is iterated as the speeches under the heading, joining each speech's [1]).
    # sdate: compared against the date parsed from the date heading and
    #   interpolated into the synthesised column stamp.
    # Raises ContextException for a non-conforming heading or a missing stamp.
    #
    # NOTE(review): this listing does not parse as written -- several
    # statements look truncated (flagged inline).  Restore the missing lines
    # from the upstream source before changing any logic here.

    # check and strip the first two headings in as much as they are there
    i = 0
    # the first record must be the literal 'Initial' heading with nothing under it
    if (headspeak[i][0] != 'Initial') or headspeak[i][2]:
        print headspeak[0]
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # import pdb;pdb.set_trace()
    # next heading is tested against "written answer(s) [to question(s)]",
    # optionally preceded by <stamp aname=.../> tags
    if (not re.match(
            '(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?(?i)',
            headspeak[i][0])) or headspeak[i][2]:
        # NOTE(review): the re.match call below is never closed and the `if`
        # has no body -- lines appear to be missing; the `i += 1` that follows
        # presumably belonged to an else branch.  TODO confirm.
        if not re.match('The following answers were received.*',
# print headspeak[i]
        i += 1

    # normalise the candidate date heading: &nbsp; -> space, drop <i> tags
    givendate = string.replace(headspeak[i][0], "&nbsp;", " ")
    givendate = re.sub("</?i>", "", givendate)

    # drop any leading <stamp aname=.../> tags so only the date text remains
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)
    # The outer test is true when the heading is neither an "answers received"
    # line nor a "question was answered on" line and its parsed date differs
    # from sdate, or when the heading has content under it.  The date is only
    # parsed when both regexes fail (short-circuit `and`).
    if (not re.match('(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*', headspeak[i][0]) and
           not re.match('(?:<stamp[^>]*>)?The following question was answered on.*', headspeak[i][0]) and \
     (sdate != mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[i][2]:
        # a heading listed in parlPhrases.wransmajorheadings with nothing
        # under it is tolerated; anything else is reported and rejected
        if (not parlPhrases.wransmajorheadings.has_key(
                headspeak[i][0])) or headspeak[i][2]:
            print headspeak[i]
            # NOTE(review): this raise call is never closed -- its remaining
            # arguments and (presumably) an else: before `i += 1` are missing.
            raise ContextException('non-conforming second heading',
        i += 1

    # find the url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    # NOTE(review): the loop body is missing; presumably it fed the skipped
    # headings headspeak[0..i-1] into stampurl -- TODO confirm upstream.
    for j in range(0, i):

# Later editions seem to miss first column number, sigh
    if not stampurl.stamp:
        # for every heading whose joined speech text contains colnum="<n>W",
        # feed stampurl a stamp numbered one column earlier (n - 1)
        for speeches in headspeak:
            text = ''.join([speech[1] for speech in speeches[2]])
            m = re.search('colnum="(\d+)W"', text)
            if m:
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>' %
                                        (sdate, int(m.group(1)) - 1))

    # all three pieces of position information are required before continuing
    if not stampurl.stamp or not stampurl.pageurl or not stampurl.aname:
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)
Exemple #2
def StripWestminhallHeadings(headspeak, sdate):
    # Step past the standard opening headings of a Westminster Hall file and
    # return (index of the first heading left, StampUrl for the file).
    #
    # headspeak: sequence of heading records; [0] is read as the heading text
    #   and [2] is tested for truthiness.
    # sdate: compared against the date parsed from the date heading.
    # Raises Exception on a date mismatch or when no stamp/page url is known.
    #
    # NOTE(review): the `for j` loop near the end has no body in this listing,
    # so the function does not parse as written -- restore from upstream.

    # check and strip the first two headings in as much as they are there
    ih = 0
    # StripDebateHeading takes (pattern, index, headspeak) and its result is
    # used as the next index
    ih = StripDebateHeading('Initial', ih, headspeak)

    # Westminster Hall
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line
    # <i> tags become spaces and leading <stamp aname=.../> tags are dropped
    # before the remainder is parsed as a date
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', givendate)
    if gd:
        givendate = gd.group(1)
    # the date heading must match sdate and have nothing under it
    if ((sdate !=
         mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[ih][2]:
        raise Exception, 'date heading %s mismatches with date %s' % (repr(
            headspeak[ih]), sdate)
    ih = ih + 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    # the time stamp is initialised to the literal value "unknown"
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"

    # NOTE(review): loop body missing; presumably it fed the skipped headings
    # headspeak[0..ih-1] into stampurl -- TODO confirm upstream.
    for j in range(0, ih):

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception, ' missing stamp url at beginning of file '
    return (ih, stampurl)
Exemple #3
def FilterWMSSpeakers(fout, text, sdate):
        # Walk the fragments that recomb.split produces from `text`, resolve
        # speaker fragments through memberList.matchwmsname, and raise
        # ContextException for fragments that look like missed speakers.
        #
        # fout:  not referenced in the visible lines -- presumably the output
        #        stream the fragments were written to; TODO confirm.
        # text:  the text that is split into fragments.
        # sdate: passed to StampUrl and to the name matcher.
        #
        # NOTE(review): this listing does not parse as written (an `except`
        # with no `try`, and no statements emitting the result); lines appear
        # to have been dropped.  Restore from upstream before editing.
        stampurl = StampUrl(sdate)

        for fss in recomb.split(text):

                # speaker detection
                speakerg = respeakervals.match(fss)
                if speakerg:
                        # group 1 or 2 supplies the anchor stamp, group 3 the
                        # speaker string, group 4 the bracketed phrase
                        anamestamp = speakerg.group(1) or speakerg.group(2) or ""
                        spstr = string.strip(speakerg.group(3))
                        spstrbrack = speakerg.group(4)
                        # NOTE(review): as listed, the lookup only runs when
                        # spstr is empty and the `except` below has no `try:`
                        # -- the lines between look truncated; TODO confirm.
                        if not spstr:
                                #print "spstr", spstr, ",", spstrbrack
                                result = memberList.matchwmsname(spstr, spstrbrack, sdate)
                        except Exception, e:
                                # re-raise with the current stamp and fragment attached
                                raise ContextException(str(e), stamp=stampurl, fragment=fss)

                        # put record in thisplace
                        # NOTE(review): spxm is built but not used in the
                        # visible lines.
                        spxm = '%s<speaker %s>%s</speaker>\n' % (anamestamp, result.encode("latin-1"), spstr)

                # nothing detected
                # check if we've missed anything obvious
                if recomb.match(fss):
                        raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
                if remarginal.search(fss):
                        raise ContextException(' marginal speaker detection case: %s' % remarginal.search(fss).group(0), fragment=fss, stamp=stampurl)

Exemple #4
def StripLordsDebateHeadings(headspeak, sdate):
    # Step past the standard opening headings of a House of Lords debate file
    # and return (index of the first heading left, StampUrl for the file).
    #
    # headspeak: sequence of heading records; [0] is read as the heading text
    #   and [2] is tested for truthiness.
    # sdate: compared against the date parsed from the date heading.
    # Raises Exception when no stamp/page url is known at the end.
    #
    # NOTE(review): this listing does not parse as written -- two re.match
    # calls are never closed, at least one else: seems to be missing, and the
    # final loop has no body.  Restore from upstream before editing logic.

    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading('Initial', ih, headspeak)

    # House of Lords
    # (the extra True argument is passed through to StripDebateHeading; its
    # meaning is not visible here)
    ih = StripDebateHeading('house of lords(?i)', ih, headspeak, True)

    # Thursday, 18th December 2003.
    # NOTE(review): the call below is never closed -- the subject string
    # (presumably headspeak[ih][0]) is missing; TODO confirm.
    mdateheading = re.match('(?:<stamp aname="[^"]*"/>)*([\w\s\d,]*)\.?',
    #time = TimeProcessing(timeg.group(1), previoustime, False, stampurl)
    #fout.write('<stamp time="%s"/>' % time)
    # a missing, mismatching or non-empty date heading is only printed, not
    # rejected (the raise below is commented out)
    if not mdateheading or (sdate != mx.DateTime.DateTimeFrom(
            mdateheading.group(1)).date) or headspeak[ih][2]:
        print headspeak[ih]
        #raise ContextException('non-conforming date heading')  # recoverable?
        # NOTE(review): presumably an else: preceded this increment.
        ih = ih + 1

    # NOTE(review): this re.match call is never closed either; the subject
    # string and the closing "):" are missing.
    if re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:THE )?(?i)QUEEN(?:\'|&....;)S SPEECH',
        print headspeak[ih][0]
        print "QUEENS SPEECH"
        # don't advance, because this is the heading (works for 2005-05-17)

    elif re.match("Parliament", headspeak[ih][0]):
        print "parliamentparliament"
        # don't advance; this is a title (works for 2005-05-11)

        # NOTE(review): as listed, the two StripDebateHeading calls below sit
        # inside the "Parliament" branch, which contradicts the "don't
        # advance" comment -- presumably an else: line was dropped.

        #<H4><center>Reassembling after the Christmas Recess, the House met at half-past two of the clock: The LORD CHANCELLOR on the Woolsack.</center></H4>
        # The House met at eleven of the clock (Prayers having been read earlier at the Judicial Sitting by the Lord Bishop of St Albans): The CHAIRMAN OF COMMITTEES on the Woolsack.
        ih = StripDebateHeading(
            '(?:reassembling.*?recess, )?the house (?:met|resumed)(?: for Judicial Business)? at ([^(]*)(?i)',
            ih, headspeak, True)
        #print starttime. (we should use the "Half past two" business in house met to set it, unfortunately the filtercoltime has already happened

        # Prayers&#151;Read by the Lord Bishop of Southwell.
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    #stampurl.timestamp = '<stamp( time="%s")/>', starttime)

    # set the time from the wording 'house met at' thing.
    # NOTE(review): loop body missing; presumably it fed the skipped headings
    # headspeak[0..ih-1] into stampurl -- TODO confirm upstream.
    for j in range(0, ih):

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception, ' missing stamp url at beginning of file '
    return (ih, stampurl)
Exemple #5
def StripDebateHeadings(headspeak, sdate):
    # Step past the standard opening headings of a House of Commons debate
    # file, derive the sitting start time from the "the House met at ..."
    # heading, and return (index of the first heading left, StampUrl).
    #
    # headspeak: sequence of heading records; [0] is read as the heading text
    #   and [2] is tested for truthiness.
    # sdate: compared against the parsed date heading and against a few
    #   hard-coded date strings that get special treatment.
    # Raises Exception on a date mismatch or missing stamp url, and
    # ContextException on a bad "house met at" heading or unknown start time.
    #
    # NOTE(review): this listing does not parse as written -- several
    # StripDebateHeading/re.match calls are never closed, an else: appears to
    # be missing before a raise, and the final loop has no body (all flagged
    # inline).  Restore from upstream before changing logic.

    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading(
        'Initial', ih, headspeak
    )  # the 'Initial' is inserted by the splitheadingsspeakers function

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        # NOTE(review): the call below is never closed (remaining arguments
        # missing).
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih,
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    # bound-volume front matter: a run of fixed headings, each stripped in turn
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        # NOTE(review): the next two calls are interleaved/truncated in this
        # listing -- the first is unclosed and the second has lost its pattern.
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
        ih = StripDebateHeading(
            headspeak, True)
        ih = StripDebateHeading('\[WHICH OPENED .*?\]', ih, headspeak, True)
        # NOTE(review): unclosed call (final argument missing).
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        # NOTE(review): unclosed call (final argument missing).
        ih = StripDebateHeading('SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading('VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    #House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # the date heading is only checked when the current heading is not already
    # the "the house met at" line
    if not re.match('the house met at .*(?i)', headspeak[ih][0]):
        # normalise: &nbsp; and <i> tags become spaces, leading stamps dropped
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', givendate)
        if gd:
            givendate = gd.group(1)
        if ((sdate !=
             mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[ih][2]:
            raise Exception, 'date heading %s mismatches with date %s' % (repr(
                headspeak[ih]), sdate)
        ih = ih + 1

    gstarttime = None
    # the "house met at" heading is not looked for on 2001-06-13
    if sdate != "2001-06-13":
        #The House met at half-past Ten o'clock
        # group 1 of this pattern captures the wording of the time
        # NOTE(review): the re.match call is never closed -- the subject
        # string (presumably headspeak[ih][0]) is missing.
        gstarttime = re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$(?i)',
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih = ih + 1

# Start of a new parliament is special
    # the prayers / standing order / "in the chair" headings are stripped
    # except on the four listed dates
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading('pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading('\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        # strip <i> tags and collapse whitespace, then map the known wordings
        # to HH:MM:SS strings; re.match anchors at the start only, so trailing
        # text after the matched wording is ignored
        time = gstarttime.group(1)
        time = re.sub('</?i>', ' ', time)
        time = re.sub('\s+', ' ', time)
        if re.match("half-past Nine(?i)", time):
            newtime = '09:30:00'
        elif re.match("a quarter to Ten o(?i)", time):
            newtime = '09:45:00'
        elif re.match("Ten o'clock(?i)", time):
            newtime = '10:00:00'
        elif re.match("half-past Ten(?i)", time):
            newtime = '10:30:00'
        elif re.match("Eleven o&#039;clock(?i)", time):
            newtime = '11:00:00'
        elif re.match("twenty-five minutes past\s*Eleven(?i)", time):
            newtime = '11:25:00'
        elif re.match("twenty-six minutes past\s*Eleven(?i)", time):
            newtime = '11:26:00'
        elif re.match("twenty-nine minutes past\s*Eleven(?i)", time):
            newtime = '11:29:00'
        elif re.match("half-past Eleven(?i)", time):
            newtime = '11:30:00'
        elif re.match("Twelve noon(?i)", time):
            newtime = '12:00:00'
        elif re.match("half-past One(?i)", time):
            newtime = '13:30:00'
        elif re.match("half-past Two(?i)", time):
            newtime = '14:30:00'
        elif re.match("twenty minutes to Three(?i)", time):
            newtime = '14:40:00'
        elif re.match("10 minutes past Three(?i)", time):
            newtime = '15:10:00'
        elif re.match("Six o'clock(?i)", time):
            newtime = '18:00:00'
            # NOTE(review): as listed, this raise sits inside the "Six
            # o'clock" branch; presumably an else: line was dropped and it is
            # the fallback for unrecognised wordings -- TODO confirm.
            raise ContextException, "Start time not known: " + time
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    # NOTE(review): loop body missing; presumably it fed the skipped headings
    # headspeak[0..ih-1] into stampurl -- TODO confirm upstream.
    for j in range(0, ih):

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception, ' missing stamp url at beginning of file '
    return (ih, stampurl)
Exemple #6
def LordsFilterSpeakers(fout, text, sdate):
	# Split `text` on respeaker, and for each bold fragment that looks like a
	# speaker write a <speaker .../> element to fout, resolving the name
	# through lordsList.GetLordIDfname.
	#
	# fout:  object with a write() method that receives the speaker markup.
	# text:  the text to scan.
	# sdate: passed to StampUrl and the name lookup; two hard-coded dates
	#        bypass the office-consistency check below.
	# Raises ContextException for an unparseable name or an office that maps
	# to two different names.
	#
	# NOTE(review): this listing does not parse as written -- many `if`
	# headers have no body (presumably write/continue statements were
	# dropped) and several else: lines seem to be missing.  Indentation also
	# mixes tabs and spaces.  Restore from upstream before editing logic.
	stampurl = StampUrl(sdate)

	# office title -> name, remembered as fragments are processed
	officematches = {}

	# setup for scanning through the file.
	for fss in respeaker.split(text):

		# strip off the bolds tags
		# get rid of non-bold stuff
		bffs = respeakerb.match(fss)
		# NOTE(review): body missing (presumably non-bold text was passed
		# through and the loop continued).
		if not bffs:


		# grab a trailing colon if there is one
		fssb = bffs.group(1)
		if bffs.group(2):
			fssb = fssb + ":"

                # Remove the cruft
                fssb = re.sub('<stamp aname="[^"]*"/>', '', fssb)
                fssb = re.sub('</b><b>', '', fssb)

		# NOTE(review): the four tests below have lost their bodies in this
		# listing.
		# empty bold phrase
		if not re.search('\S', fssb):

		# division/contents/amendment which means this is not a speaker
		if renonspek.search(fssb):

		# part of quotes as an inserted title in an amendment
		if re.match('("|\[|&quot;)', fssb):

		# another title type (all caps), or a clause number
		if not re.search('[a-z]', fssb):

		# start piecing apart the name by office and leadout type
		namec = respeakervals.match(fssb)
		if not namec:
			print '*', fssb, '*'
			raise ContextException("bad format", stamp=stampurl, fragment=fssb)

		# when a bracketed part is present, the bracket holds the name and
		# the 'name' group holds the office
		# NOTE(review): as listed, the last two assignments overwrite the
		# first two; presumably an else: was dropped between them.
		if namec.group('bracket'):
			name = re.sub('\s+', ' ', namec.group('bracket'))
			loffice = re.sub('\s+', ' ', namec.group('name'))
			name = re.sub('\s+', ' ', namec.group('name'))
			loffice = None

		colon = namec.group('colon')
		if not colon:
			colon = ""

		# get rid of some standard ones
		# these generic speakers are written with person_id "unknown"
		if re.match('the lord chancellor|noble lords|a noble lord|a noble baroness|the speaker(?i)', name):
			fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>' % ('unknown', name, name))

		# map through any office information
		if loffice:
			# an office seen before must map to the same name, except for
			# "The Deputy ..."/"The Minister of State" offices and the two
			# listed dates
			if (not re.match("The (Deputy |Minister of State)", loffice)) and (loffice in officematches):
                                if sdate!='2014-09-26' and sdate!='2012-09-24' and officematches[loffice] != name:
                                        raise ContextException("office inconsistency, loffice: %s name: %s officematches: %s" % (loffice, name, officematches[loffice]), stamp=stampurl, fragment=fssb)
				officematches[loffice] = name
		elif name in officematches:
			# the fragment gave only an office already seen: swap in the
			# remembered name
			loffice = name
			name = officematches[loffice]

		if regenericspeak.match(name):
			fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>' % ('unknown', name, name))

		lsid = lordsList.GetLordIDfname(name, loffice=loffice, sdate=sdate, stampurl=stampurl)  # maybe throw the exception on the outside

		# NOTE(review): as listed both writes run when lsid is falsy;
		# presumably an else: was dropped before the second one.
                if not lsid:
                        fout.write('<speaker person_id="unknown" error="No match" speakername="%s" colon="%s">%s</speaker>' % (name, colon, name))
                        fout.write('<speaker person_id="%s" speakername="%s" colon="%s">%s</speaker>' % (lsid, name, colon, name))

		# the 'maiden' group, when present, is written after the speaker in italics
                if namec.group('maiden'):
                        fout.write('<i>%s</i>' % namec.group('maiden'))
Exemple #7
def FilterDebateSpeakers(fout, text, sdate, typ):
	# Pre-process `text` (fix substitutions, missing bold tags, misplaced
	# "Urgent Question" markers), then split it with recomb and resolve each
	# speaker fragment through memberList.matchdebatename.
	#
	# fout:  not referenced in the visible lines -- presumably the output
	#        stream; TODO confirm.
	# text:  the debate text to process.
	# sdate: passed to the fix-up, StampUrl and name-matching helpers.
	# typ:   "westminhall" and "debate" get special handling below; the
	#        value is also passed to matchdebatename.
	# Raises ContextException when the "[... in the Chair]" phrase is missing
	# (westminhall), when name matching fails, or when a fragment looks like
	# a missed speaker.
	#
	# NOTE(review): this listing does not parse as written -- several `if`
	# headers have no body, an `except` has no `try`, and an else: seems to
	# be missing.  Indentation mixes tabs and spaces.  Restore from upstream
	# before editing logic.

	if typ == "westminhall":
		# remember who is in the chair so "Deputy Speaker" can be replaced
		# by that name further down
		depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
		if not depspeakerrg:
			raise ContextException("Can't find the [... in the Chair] phrase")
		depspeaker = depspeakerrg.group(1)

	# old style fixing (before patches existed)
	if typ == "debate":
		text = ApplyFixSubstitutions(text, sdate, fixsubs)

        # for error messages
	stampurl = StampUrl(sdate)

        # Fix missing bold tags around names
        # A paragraph starting "<stamp.../>Name (brackets): " is wrapped in
        # <b>...</b> when memberList.fullnametoids finds the name; the
        # optional "<b></b>"/whitespace piece (p2) is dropped in the
        # replacement.
        missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
        for p1,p2,p3,p4,p5 in missingbolds:
                missingbold = "%s%s%s%s%s" % (p1,p2,p3,p4,p5)
                bold = "%s<b>%s%s%s</b>" % (p1,p3,p4,p5)
                namematches = memberList.fullnametoids(p3, sdate)
                if namematches:
                        if not missingbold in text:
                                print "ERROR: missing bold text found, but then vanished when replacing"
                        text = text.replace(missingbold, bold)

        # Move Urgent Question out of speaker name
        # the "(Urgent Question)" italic piece (p3) is moved to after the
        # closing ":</b>" (p4)
        urgentqns = re.findall('(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)(?i)', text)
        for p1,p2,p3,p4 in urgentqns:
                urgentqn = "%s%s%s%s" % (p1,p2,p3,p4)
                correction = "%s%s%s%s" % (p1,p2,p4,p3)
                text = text.replace(urgentqn, correction)

	# setup for scanning through the file.
	for fss in recomb.split(text):
                #print fss
                #print "--------------------"

		# NOTE(review): the next three tests have lost their bodies in
		# this listing (presumably pass-through writes and continue).
		# division number detection (these get through the speaker detection regexp)
		if redivno.match(fss) or retabletext.match(fss):

		# CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
		if fss == "<b>CORRECTION</b>":

                if re.match('<b>(&#8220;)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):

		# speaker detection
		speakerg = respeakervals.match(fss)
		if speakerg:
			# optional parts of the group
			# we can use oqnum to detect oral questions
			anamestamp = speakerg.group(4) or speakerg.group(3) or ""
			# the question number comes from group 1 or group 5, never
			# both (asserted)
			oqnum = speakerg.group(1)
			if speakerg.group(5):
				assert not oqnum
				oqnum = speakerg.group(5)
			# NOTE(review): as listed, the attribute string is built and
			# then immediately reset to ""; presumably an else: was
			# dropped before the second assignment.
			if oqnum:
				oqnum = ' oral-qnum="%s"' % oqnum
				oqnum = ""

			# the preceding square bracket qnums
			sqbnum = speakerg.group(2) or ""

			# NOTE(review): party is not used in the visible lines.
			party = speakerg.group(8) or speakerg.group(10)

			spstr = string.strip(speakerg.group(6))
			spstrbrack = speakerg.group(7) or speakerg.group(9) # the bracketted phrase (sometimes the constituency or name if it is a minister)
                        if spstrbrack:
                                spstrbrack = re.sub("\n", ' ', spstrbrack)

			# do quick substitution for dep speakers in westminster hall
			if typ == "westminhall" and re.search("deputy[ \-]speaker(?i)", spstr) and not spstrbrack:
				#spstrbrack = depspeaker
				spstr = depspeaker

			# match the member to a unique identifier and displayname
			# NOTE(review): the `try:` line that should open this block
			# is missing from the listing.
				#print "spstr", spstr, ",", spstrbrack
				#print speakerg.groups()
				result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
			except Exception, e:
				# add extra stamp info to the exception
				raise ContextException(str(e), stamp=stampurl, fragment=fss)

			# put record in this place
			#print "ree", result.encode("latin-1")
			# NOTE(review): spxm is built but not used in the visible
			# lines.
			spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)

		# nothing detected
		# check if we've missed anything obvious
		if recomb.match(fss):
			raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
		if remarginal.search(fss):
			raise ContextException(' marginal speaker detection case: %s' % remarginal.search(fss).group(0), fragment=fss, stamp=stampurl)

		# this is where we phase in the ascii encoding
Exemple #8
def FilterWransSpeakers(fout, text, sdate):
    # Find the bold speaker names in written-answers `text`, resolve each to a
    # person id through memberList, and rewrite the fragment in place as a
    # <speaker .../> element.
    #
    # fout:  not referenced in the visible lines -- the trailing comment says
    #        the fragments are written out afterwards, but that code is not
    #        shown here.
    # text:  the written-answers text to process.
    # sdate: passed to the fix-up, name-matching and StampUrl helpers; one
    #        hard-coded date selects a manual name override below.
    #
    # NOTE(review): this listing does not parse as written -- several calls
    # are never closed, some `if` headers have no body and several else:
    # lines seem to be missing (flagged inline).  Restore from upstream
    # before editing logic.
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    # NOTE(review): the findall call is never closed -- its subject argument
    # (presumably `text`) is missing.
    missingbolds = re.findall(
        '(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
    for p1, p2, p3, p4 in missingbolds:
        # the replacement drops p2 (the optional "<b></b>"/whitespace piece)
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        namematches = memberList.fullnametoids(p3, sdate)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if namematches:
            #print "Fixing missing bold, had name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())
            if not missingbold in text:
                print "ERROR: missing bold text found, but then vanished when replacing"
            text = text.replace(missingbold, bold)
        #print "Plausible missing bold not fixed, as no name matches:\n\t%s\n\t%s" % (missingbold.strip(), bold.strip())

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out
    tableregexp = ltableregexp + '(?i)'

    # one capturing group, so re.split keeps the matched tables/speakers as
    # their own list items between the surrounding text
    lregexp = '(%s|%s)(?i)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    fs = re.split(lregexp, text)

    # for error messages
    stampurl = StampUrl(sdate)

    # index loop because neighbouring items fs[i - 1] and fs[i + 1] are
    # modified while processing fs[i]
    for i in range(len(fs)):
        fss = fs[i]
        fss = stampurl.UpdateStampUrl(fss)  # Speakers have new stamps in them

        # NOTE(review): the next two tests have lost their bodies in this
        # listing (presumably `continue`, so tables and non-bold text are
        # left untouched).
        if re.match(tableregexp, fss):

        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:

        # we have a string in bold
        boldnamestring = string.strip(speakerg[0][0])

        # trailing text after the colon in the bold speech bit
        if re.search('\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = string.strip(sqb[0][0])
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        # NOTE(review): presumably followed by `continue` upstream.
        if not re.search('\S', boldnamestring):
            fs[i] = ''

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            # a trailing "Q?<digits>." at the end of the previous fragment is
            # removed from it and prefixed to the name
            oqnsep = re.findall(
                '^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            # NOTE(review): this statement is never closed, and an else: is
            # presumably missing before the bracket-splitting code below.
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
            # split bracketed cons out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            # NOTE(review): as listed the second assignment overwrites the
            # first; presumably an else: was dropped between them.
            if brakmatch:
                (name, cons) = brakmatch.groups()
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            # NOTE(review): the matchfullnamecons call is never closed -- its
            # remaining arguments are missing.
            (person_id, remadename,
             remadecons) = memberList.matchfullnamecons(name,
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not person_id:
                # hard-coded overrides for known ambiguous/unmatched names
                # NOTE(review): else: lines appear to be missing inside this
                # block, so the branch structure cannot be read off reliably.
                if remadename == "MultipleMatch":
                    if boldnamestring == 'Mr. Michael Foster':
                        if remadecons[0] == 'uk.org.publicwhip/person/10209':
                            person_id = remadecons[0]
                            remadename = ' speakername="Michael Foster"'
                            remadecons = 'Worcester'
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                    print "  No name,const match (%s,%s)" % (name, cons)
                    # NOTE(review): this raise call is never closed.
                    raise ContextException("No name match",

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \
          (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file