Ejemplo n.º 1
0
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite speaker headings in debate text as <speaker> elements.

    The text is first patched up (missing bold tags around names, misplaced
    "(Urgent Question)" markers), then split with ``recomb``.  Every fragment
    is written to ``fout`` in order, with recognised speaker headings replaced
    by ``<speaker ...>name</speaker>`` records.

    Args:
        fout: output object; only ``write`` is called on it.
        text: the debate text to filter.
        sdate: date passed to the fix substitutions, to ``StampUrl`` and to
            the member-name lookups.
        typ: kind of debate.  ``"debate"`` enables ``ApplyFixSubstitutions``;
            ``"westminhall"`` requires a "[... in the Chair]" phrase, whose
            name replaces bare "deputy speaker" headings.  It is also
            forwarded to ``memberList.matchdebatename``.

    Raises:
        ContextException: if the "[... in the Chair]" phrase is missing for
            ``"westminhall"``, if name matching fails, or if a fragment looks
            like a speaker heading that the speaker regexp did not catch.
        AssertionError: if both oral-question-number groups of the speaker
            regexp are set for the same heading.
    """
    # Name of whoever is in the Chair; only needed for Westminster Hall.
    depspeaker = None
    if typ == "westminhall":
        depspeakerrg = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names: a paragraph that opens with stamps,
    # then a plausible name, optional bracketed phrases and a colon.
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)',
        text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        # Only add the bold when the candidate is a known member name.
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name: the italic marker is swapped
    # with the closing ":</b>" so that it follows the bold heading.
    # The case-insensitive flag is passed explicitly; a trailing "(?i)" inside
    # the pattern is rejected by newer versions of the re module.
    urgentqns = re.findall(
        r'(<p>(?:<stamp aname="[^"]+"/>)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>)(:</b>)',
        text, re.IGNORECASE)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # scan through the file fragment by fragment.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are
        # surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use the question number to detect oral questions; it can
            # arrive in group 1 or group 5, but never in both.
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            oqattr = ' oral-qnum="%s"' % oqnum if oqnum else ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it
            # is a minister), flattened onto one line
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = spstrbrack.replace("\n", " ")

            # do quick substitution for dep speakers in westminster hall
            if (typ == "westminhall"
                    and re.search(r"deputy[ \-]speaker", spstr, re.IGNORECASE)
                    and not spstrbrack):
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (
                anamestamp, result.encode("latin-1"), oqattr, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
Ejemplo n.º 2
0
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite speaker headings in debate text as <speaker> elements.

    The text is first patched up (missing bold tags around names, misplaced
    "(Urgent Question)" markers), then split with ``recomb``.  Every fragment
    is written to ``fout`` in order, with recognised speaker headings replaced
    by ``<speaker ...>name</speaker>`` records.

    Args:
        fout: output object; only ``write`` is called on it.
        text: the debate text to filter.
        sdate: date passed to the fix substitutions, to ``StampUrl`` and to
            the member-name lookups.
        typ: kind of debate.  ``"debate"`` enables ``ApplyFixSubstitutions``;
            ``"westminhall"`` requires a "[... in the Chair]" phrase, whose
            name replaces bare "deputy speaker" headings.  It is also
            forwarded to ``memberList.matchdebatename``.

    Raises:
        ContextException: if the "[... in the Chair]" phrase is missing for
            ``"westminhall"``, if name matching fails, or if a fragment looks
            like a speaker heading that the speaker regexp did not catch.
        AssertionError: if both oral-question-number groups of the speaker
            regexp are set for the same heading.
    """
    # Name of whoever is in the Chair; only needed for Westminster Hall.
    depspeaker = None
    if typ == "westminhall":
        depspeakerrg = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names: a paragraph that opens with stamps,
    # then a plausible name, optional bracketed phrases and a colon.
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)',
        text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        # Only add the bold when the candidate is a known member name.
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name: the italic marker (with any
    # trailing whitespace) is swapped with the closing ":</b>" so that it
    # follows the bold heading.
    # The case-insensitive flag is passed explicitly; a trailing "(?i)" inside
    # the pattern is rejected by newer versions of the re module.
    urgentqns = re.findall(
        r'(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)',
        text, re.IGNORECASE)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # scan through the file fragment by fragment.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection
        # regexp); fragments matching retabletext are passed through the
        # same way.
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are
        # surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # A bold run that is not a speaker: optionally opening with &#8220;,
        # then either a number followed by text, "CHAPTER n"/"PART n", or
        # only capitals, commas and spaces.  Presumably quoted legislative
        # headings -- written through unchanged.
        if re.match('<b>(&#8220;)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use the question number to detect oral questions; it can
            # arrive in group 1 or group 5, but never in both.
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            oqattr = ' oral-qnum="%s"' % oqnum if oqnum else ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it
            # is a minister), flattened onto one line
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = spstrbrack.replace("\n", " ")

            # do quick substitution for dep speakers in westminster hall
            if (typ == "westminhall"
                    and re.search(r"deputy[ \-]speaker", spstr, re.IGNORECASE)
                    and not spstrbrack):
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (
                anamestamp, result.encode("latin-1"), oqattr, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
Ejemplo n.º 3
0
def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with <speaker> tags.

    The text is split on tables and bold runs.  Each bold run that holds a
    name is looked up through ``memberList`` and rewritten as
    ``<speaker speakerid="..." speakername="...">name</speaker>``; all
    pieces, rewritten or not, are then written to ``fout`` in order.

    Args:
        fout: output object; only ``writelines`` is called on it.
        text: the text to filter.
        sdate: date passed to the fix substitutions, to ``StampUrl`` and to
            the member-name lookups.

    Raises:
        ContextException: when a bold name matches no member at all.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
        text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    # tables have bolds, so must be separated out
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'

    # All three patterns are case-insensitive.  The flag is passed explicitly;
    # a trailing "(?i)" inside the pattern is rejected by newer versions of
    # the re module.
    tablere = re.compile(ltableregexp, re.IGNORECASE)
    splitre = re.compile('(%s|%s)' % (ltableregexp, lspeakerregexp), re.IGNORECASE)
    boldre = re.compile(r'<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', re.IGNORECASE)

    # setup for scanning through the file.  The capturing group keeps the
    # tables and bold runs in the list alongside the text between them.
    fs = splitre.split(text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Indexed loop on purpose: a bold run pushes text into fs[i+1] and pulls
    # a question number out of fs[i-1].
    for i in range(len(fs)):
        # Speakers have new stamps in them
        fss = stampurl.UpdateStampUrl(fs[i])

        if tablere.match(fss):
            continue

        speakerg = boldre.findall(fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search(r'\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if '<broken-name>' in boldnamestring:
            speakerid = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (speakerid, remadename, remadecons) = memberList.matchfullnamecons(
                name, cons, sdate, alwaysmatchcons=False)
            if speakerid and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not speakerid:
                if remadename == "MultipleMatch":
                    # Hard-wired disambiguation: an ambiguous
                    # 'Mr. Michael Foster' is taken to be the Worcester
                    # member when one of these ids is among the candidates.
                    resolved = None
                    if boldnamestring == 'Mr. Michael Foster':
                        if remadecons[1] == 'uk.org.publicwhip/member/1939':
                            resolved = remadecons[1]
                        elif remadecons[0] == 'uk.org.publicwhip/member/896':
                            resolved = remadecons[0]
                    if resolved:
                        speakerid = resolved
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # Any other ambiguous name is recorded as unknown.
                        # This also covers a 'Mr. Michael Foster' whose
                        # candidates include neither id above; that case
                        # used to fall through with no id set and broke
                        # the record formatting below.
                        speakerid = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    speakerid = 'uk.org.publicwhip/member/40316'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker speakerid="%s"%s>%s</speaker>\n' % (
            speakerid.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)
Ejemplo n.º 4
0
def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with <speaker> tags.

    The text is split on tables and bold runs.  Each bold run that holds a
    name is looked up through ``memberList`` and rewritten as
    ``<speaker person_id="..." speakername="...">name</speaker>``; all
    pieces, rewritten or not, are then written to ``fout`` in order.

    Args:
        fout: output object; only ``writelines`` is called on it.
        text: the text to filter.
        sdate: date passed to the fix substitutions, to ``StampUrl`` and to
            the member-name lookups.

    Raises:
        ContextException: when a bold name matches no member at all.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in. We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)',
        text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    # tables have bolds, so must be separated out
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'

    # All three patterns are case-insensitive.  The flag is passed explicitly;
    # a trailing "(?i)" inside the pattern is rejected by newer versions of
    # the re module.
    tablere = re.compile(ltableregexp, re.IGNORECASE)
    splitre = re.compile('(%s|%s)' % (ltableregexp, lspeakerregexp), re.IGNORECASE)
    boldre = re.compile(r'<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', re.IGNORECASE)

    # setup for scanning through the file.  The capturing group keeps the
    # tables and bold runs in the list alongside the text between them.
    fs = splitre.split(text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Indexed loop on purpose: a bold run pushes text into fs[i+1] and pulls
    # a question number out of fs[i-1].
    for i in range(len(fs)):
        # Speakers have new stamps in them
        fss = stampurl.UpdateStampUrl(fs[i])

        if tablere.match(fss):
            continue

        speakerg = boldre.findall(fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search(r'\S', speakerg[0][1]):
            fs[i + 1] = speakerg[0][1] + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$',
                fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if '<broken-name>' in boldnamestring:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
                boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename, remadecons) = memberList.matchfullnamecons(
                name, cons, sdate, alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)
            if not person_id:
                if remadename == "MultipleMatch":
                    # Hard-wired disambiguation: an ambiguous
                    # 'Mr. Michael Foster' is taken to be the Worcester
                    # member when that person is the first candidate.
                    if (boldnamestring == 'Mr. Michael Foster'
                            and remadecons[0] == 'uk.org.publicwhip/person/10209'):
                        person_id = remadecons[0]
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # Any other ambiguous name is recorded as unknown.
                        # This also covers a 'Mr. Michael Foster' whose
                        # first candidate is someone else; that case used
                        # to fall through with no id set and broke the
                        # record formatting below.
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print("  No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match",
                                           stamp=stampurl,
                                           fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % (
            person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)