Esempio n. 1
0
# Ad-hoc smoke checks for the member and lord name resolvers.  Every lookup
# result is printed so it can be inspected by eye.
print(memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22"))

# The script stops here: none of the checks below run while this exit is in
# place (presumably a debugging shortcut -- remove it to run the full list).
sys.exit(0)

# Lords looked up by full name, all on the same date.
for lordname in ('Baroness Thatcher',
                 'The Archbishop of York',
                 'The Bishop of Southwell and Nottingham'):
    print(lordsList.GetLordIDfname(lordname, None, '2006-05-01'))

# Two names against the same constituency, on two different dates.
for checkdate in ("2006-01-22", "2004-01-22"):
    for fullname in ("Anne Moffat", "Anne Picking"):
        print(memberList.matchfullnamecons(fullname, "East Lothian", checkdate))

# Canonical constituency name on two different dates.
for checkdate in ("2001-01-01", "2005-05-06"):
    print(memberList.canonicalcons("Aberdeen North", checkdate))

# Speakers identified by office rather than by name.
for office, checkdate in (("Solicitor-General", "2003-11-21"),
                          ("The Advocate-General for Scotland", "2004-07-30")):
    print(memberList.matchdebatename(office, None, checkdate))

# Lookups keyed on a member id.
for memberid in ("uk.org.publicwhip/member/1238",
                 "uk.org.publicwhip/member/1353",
                 "uk.org.publicwhip/member/1357"):
    print(memberList.getmembersoneelection(memberid))

# Debate-style names, with and without the bracketed constituency.
for spname, bracket, checkdate in (("Mr. Mackay", None, "2003-11-21"),
                                   ("James Marshall", None, "2003-11-21"),
                                   ("Gareth Thomas", "Clwyd, West", "2003-11-21"),
                                   ("Gareth Thomas", None, "2005-05-07")):
    print(memberList.matchdebatename(spname, bracket, checkdate))

# The same two name/constituency pairs on two dates (presumably either side
# of a constituency rename -- confirm against the member data).
for checkdate in ("2005-04-01", "2005-05-07"):
    for fullname, cons in (("Mr. MacDonald", "Western Isles"),
                           ("Mr. MacNeil", "Na h-Eileanan an Iar")):
        print(memberList.matchfullnamecons(fullname, cons, checkdate))
Esempio n. 2
0
# Manual checks of the name-matching tables: each call's return value is
# printed for visual inspection.
print(memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22"))

# Execution ends at this exit, so the remaining checks are currently
# unreachable (presumably left in to be re-enabled by hand).
sys.exit(0)

# House of Lords: resolve three titles on one date.
lords_date = '2006-05-01'
for title in ('Baroness Thatcher',
              'The Archbishop of York',
              'The Bishop of Southwell and Nottingham'):
    print(lordsList.GetLordIDfname(title, None, lords_date))

# East Lothian: both names, first on the 2006 date and then on the 2004 date.
for when in ("2006-01-22", "2004-01-22"):
    for who in ("Anne Moffat", "Anne Picking"):
        print(memberList.matchfullnamecons(who, "East Lothian", when))

# Constituency canonicalisation for one name on two dates.
for when in ("2001-01-01", "2005-05-06"):
    print(memberList.canonicalcons("Aberdeen North", when))

# Office titles used in place of a member's name.
for who, when in (("Solicitor-General", "2003-11-21"),
                  ("The Advocate-General for Scotland", "2004-07-30")):
    print(memberList.matchdebatename(who, None, when))

# Lookups by member id.
for idnum in ("1238", "1353", "1357"):
    print(memberList.getmembersoneelection("uk.org.publicwhip/member/" + idnum))

# Names as they appear in debates; the second field is the bracketed phrase.
for who, inbrackets, when in (("Mr. Mackay", None, "2003-11-21"),
                              ("James Marshall", None, "2003-11-21"),
                              ("Gareth Thomas", "Clwyd, West", "2003-11-21"),
                              ("Gareth Thomas", None, "2005-05-07")):
    print(memberList.matchdebatename(who, inbrackets, when))

# Two name/constituency pairs, each tried on both dates.
for when in ("2005-04-01", "2005-05-07"):
    for who, seat in (("Mr. MacDonald", "Western Isles"),
                      ("Mr. MacNeil", "Na h-Eileanan an Iar")):
        print(memberList.matchfullnamecons(who, seat, when))
Esempio n. 3
0
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite the speaker headings of a debate transcript as <speaker> tags.

    The text is patched first (bold tags added around bare member names,
    "(Urgent Question)" moved out of the bold speaker name), then split with
    ``recomb``.  Each fragment is written to ``fout``: fragments matched by
    ``respeakervals`` are replaced by a ``<speaker ...>`` element built from
    ``memberList.matchdebatename``; everything else is copied through.

    Parameters:
        fout  -- output object; only its ``write`` method is used.
        text  -- the transcript to filter (presumably unicode, since some
                 fragments are encoded to latin-1 on output -- confirm
                 against the caller).
        sdate -- date value handed to ``ApplyFixSubstitutions``,
                 ``StampUrl`` and the ``memberList`` lookups.
        typ   -- kind of transcript.  "westminhall" and "debate" get extra
                 handling here; the value is also forwarded to
                 ``memberList.matchdebatename``.

    Raises:
        ContextException -- if a "westminhall" text has no
            "[... in the Chair]" phrase, if ``matchdebatename`` raises (the
            error is re-raised with the current stamp and fragment
            attached), or if an unrecognised fragment still matches
            ``recomb`` or ``remarginal``.
        AssertionError -- if a speaker match carries a question number in
            both group 1 and group 5.
    """
    # For Westminster Hall, remember who is named in the first
    # "[X in the Chair]" phrase; "Deputy Speaker" headings are later
    # replaced by that name.
    depspeaker = None
    if typ == "westminhall":
        chairmatch = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not chairmatch:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = chairmatch.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names: a paragraph that opens with plain
    # "Name (optional brackets): " gets <b>...</b> put around that heading,
    # but only when the name is known to memberList on this date.
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for lead, gap, name, brackets, colon in missingbolds:
        if not memberList.fullnametoids(name, sdate):
            continue
        unbolded = "%s%s%s%s%s" % (lead, gap, name, brackets, colon)
        # the empty <b></b> / whitespace in `gap` is dropped on purpose
        bolded = "%s<b>%s%s%s</b>" % (lead, name, brackets, colon)
        if unbolded not in text:
            # an earlier replacement in this loop already consumed it
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(unbolded, bolded)

    # Move Urgent Question out of speaker name: swap the italic marker and
    # the closing ":</b>" so the marker follows the bold heading.
    urgentqns = re.findall(r'(<p>(?:<stamp aname="[^"]+"/>)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>)(:</b>)', text, re.I)
    for lead, name, marker, closing in urgentqns:
        text = text.replace(lead + name + marker + closing,
                            lead + name + closing + marker)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use oqnum to detect oral questions; the number may sit
            # in group 1 or group 5, never both
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            oqnumattr = ' oral-qnum="%s"' % oqnum if oqnum else ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            # (groups 8 and 10 used to be read into a local called `party`
            # that was never used; the dead assignment has been dropped)

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = spstrbrack.replace("\n", " ")

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search(r"deputy[ \-]speaker", spstr, re.I) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            # NOTE(review): only `result` is encoded here; if the other
            # pieces are unicode and `result` holds non-ASCII characters the
            # formatting may fail under Python 2 -- confirm.
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnumattr, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        # (the fragment is written without an explicit encode -- presumably
        # relying on the implicit conversion; confirm)
        fout.write(fss)
Esempio n. 4
0
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Rewrite the speaker headings of a debate transcript as <speaker> tags.

    The text is patched first (bold tags added around bare member names,
    "(Urgent Question)" moved out of the bold speaker name), then split with
    ``recomb``.  Each fragment is written to ``fout``: division numbers,
    table text, the CORRECTION title and certain bold headings are copied
    through; fragments matched by ``respeakervals`` are replaced by a
    ``<speaker ...>`` element built from ``memberList.matchdebatename``;
    anything else is copied through after two sanity checks.

    Parameters:
        fout  -- output object; only its ``write`` method is used.
        text  -- the transcript to filter (presumably unicode, since some
                 fragments are encoded to latin-1 on output -- confirm
                 against the caller).
        sdate -- date value handed to ``ApplyFixSubstitutions``,
                 ``StampUrl`` and the ``memberList`` lookups.
        typ   -- kind of transcript.  "westminhall" and "debate" get extra
                 handling here; the value is also forwarded to
                 ``memberList.matchdebatename``.

    Raises:
        ContextException -- if a "westminhall" text has no
            "[... in the Chair]" phrase, if ``matchdebatename`` raises (the
            error is re-raised with the current stamp and fragment
            attached), or if an unrecognised fragment still matches
            ``recomb`` or ``remarginal``.
        AssertionError -- if a speaker match carries a question number in
            both group 1 and group 5.
    """
    # For Westminster Hall, remember who is named in the first
    # "[X in the Chair]" phrase; "Deputy Speaker" headings are later
    # replaced by that name.
    depspeaker = None
    if typ == "westminhall":
        chairmatch = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not chairmatch:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = chairmatch.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names: a paragraph that opens with plain
    # "Name (optional brackets): " gets <b>...</b> put around that heading,
    # but only when the name is known to memberList on this date.
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for lead, gap, name, brackets, colon in missingbolds:
        if not memberList.fullnametoids(name, sdate):
            continue
        unbolded = "%s%s%s%s%s" % (lead, gap, name, brackets, colon)
        # the empty <b></b> / whitespace in `gap` is dropped on purpose
        bolded = "%s<b>%s%s%s</b>" % (lead, name, brackets, colon)
        if unbolded not in text:
            # an earlier replacement in this loop already consumed it
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(unbolded, bolded)

    # Move Urgent Question out of speaker name: swap the italic marker and
    # the closing ":</b>" so the marker follows the bold heading.
    urgentqns = re.findall(r'(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)', text, re.I)
    for lead, name, marker, closing in urgentqns:
        text = text.replace(lead + name + marker + closing,
                            lead + name + closing + marker)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp);
        # fragments matched by retabletext are passed through the same way
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # Bold fragments that are headings rather than speakers are copied
        # through untouched: text starting with a number (plus optional
        # capital letters) and a space, "CHAPTER n" / "PART n", or text made
        # only of capitals, commas and spaces -- each optionally opened by
        # &#8220;.  Unlike the two cases above this write is not encoded.
        if re.match('<b>(&#8220;)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use oqnum to detect oral questions; the number may sit
            # in group 1 or group 5, never both
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            oqnumattr = ' oral-qnum="%s"' % oqnum if oqnum else ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            # (groups 8 and 10 used to be read into a local called `party`
            # that was never used; the dead assignment has been dropped)

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = spstrbrack.replace("\n", " ")

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search(r"deputy[ \-]speaker", spstr, re.I) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            # NOTE(review): only `result` is encoded here; if the other
            # pieces are unicode and `result` holds non-ASCII characters the
            # formatting may fail under Python 2 -- confirm.
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnumattr, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        # (the fragment is written without an explicit encode -- presumably
        # relying on the implicit conversion; confirm)
        fout.write(fss)