def StripWestminhallHeadings(headspeak, sdate):
    """Skip the fixed opening headings of a Westminster Hall file.

    Steps over the 'Initial' entry, the "Westminster Hall" heading and the
    date line at the start of headspeak, then feeds the text attached to
    those skipped entries to a fresh StampUrl.

    headspeak -- indexable sequence of entries; element 0 of an entry is
        the heading text, element 1 is passed to UpdateStampUrl and
        element 2 has to be false on the date line.
    sdate -- value the parsed date heading is compared against.

    Returns (ih, stampurl): index of the first entry that was not skipped
    and the StampUrl built from the skipped ones.

    Raises Exception if the date heading disagrees with sdate (or carries
    a true element 2), or if no stamp / page url was collected.
    """
    # both leading headings are mandatory
    ih = StripDebateHeading('Initial', 0, headspeak)
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line: blank out italic tags and drop leading aname stamps
    dateentry = headspeak[ih]
    datetext = re.sub('</?i>', ' ', dateentry[0])
    unstamped = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', datetext)
    if unstamped:
        datetext = unstamped.group(1)
    if sdate != mx.DateTime.DateTimeFrom(datetext).date or dateentry[2]:
        raise Exception('date heading %s mismatches with date %s' % (repr(dateentry), sdate))
    ih += 1

    # The next heading looks like
    #   <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # and is deliberately not skipped, so that it stays as a title.

    # url, colnum and time stamps that occur before anything else
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"
    for skipped in range(ih):
        stampurl.UpdateStampUrl(headspeak[skipped][1])

    if not (stampurl.stamp and stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def StripWestminhallHeadings(headspeak, sdate):
    """Skip the fixed opening headings of a Westminster Hall file.

    Steps over the 'Initial' entry, the "Westminster Hall" heading and the
    date line at the start of headspeak, then feeds the text attached to
    those skipped entries to a fresh StampUrl.

    headspeak -- indexable sequence of entries; element 0 of an entry is
        the heading text, element 1 is passed to UpdateStampUrl and
        element 2 has to be false on the date line.
    sdate -- value the parsed date heading is compared against.

    Returns (ih, stampurl): index of the first entry that was not skipped
    and the StampUrl built from the skipped ones.

    Raises Exception if the date heading disagrees with sdate (or carries
    a true element 2), or if no stamp / page url was collected.
    """
    # both leading headings are mandatory
    ih = StripDebateHeading('Initial', 0, headspeak)
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line: blank out italic tags and drop leading aname stamps
    dateentry = headspeak[ih]
    datetext = re.sub('</?i>', ' ', dateentry[0])
    unstamped = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', datetext)
    if unstamped:
        datetext = unstamped.group(1)
    if sdate != mx.DateTime.DateTimeFrom(datetext).date or dateentry[2]:
        raise Exception('date heading %s mismatches with date %s' % (repr(dateentry), sdate))
    ih += 1

    # The next heading looks like
    #   <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # and is deliberately not skipped, so that it stays as a title.

    # url, colnum and time stamps that occur before anything else
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"
    for skipped in range(ih):
        stampurl.UpdateStampUrl(headspeak[skipped][1])

    if not (stampurl.stamp and stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def FilterWMSSpeakers(fout, text, sdate):
    """Copy text to fout, replacing speaker fragments with <speaker> tags.

    text is cut up with recomb.split.  A fragment that respeakervals
    matches is resolved through memberList.matchwmsname and written as
    '<speaker ...>name</speaker>'; a matched fragment whose name is empty
    is dropped; everything else is written through unchanged.

    fout -- object with a write() method.
    text -- the text to filter.
    sdate -- passed to StampUrl and to the name matcher.

    Raises ContextException when the name matcher fails, when an unmatched
    fragment still matches recomb, or when remarginal finds something in
    an unmatched fragment.
    """
    stampurl = StampUrl(sdate)

    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        found = respeakervals.match(fss)
        if not found:
            # nothing detected: check that nothing obvious was missed
            if recomb.match(fss):
                raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
            marginal = remarginal.search(fss)
            if marginal:
                raise ContextException(' marginal speaker detection case: %s' % marginal.group(0),
                                       fragment=fss, stamp=stampurl)
            fout.write(fss)
            continue

        anamestamp = found.group(1) or found.group(2) or ""
        spstr = found.group(3).strip()
        spstrbrack = found.group(4)
        if not spstr:
            continue

        try:
            result = memberList.matchwmsname(spstr, spstrbrack, sdate)
        except Exception as e:
            # re-raise with the position in the file attached
            raise ContextException(str(e), stamp=stampurl, fragment=fss)

        # put the record in this place
        fout.write('%s<speaker %s>%s</speaker>\n' % (anamestamp, result.encode("latin-1"), spstr))
def StripWransHeadings(headspeak, sdate):
    """Skip the opening headings of a written-answers file.

    Requires the 'Initial' entry, optionally skips a "Written Answers"
    heading, and skips the following heading when it is a date equal to
    sdate or one of the "answers received" / "question was answered on"
    notes.  A heading that fails that test is left in place if it is a key
    of parlPhrases.wransmajorheadings and has a false element 2; otherwise
    it is an error.

    headspeak -- indexable sequence of entries; element 0 is the heading
        text, element 1 further text, element 2 a sequence whose items
        have text at index 1.
    sdate -- value the date heading is compared against.

    Returns (i, stampurl): index of the first entry not skipped and the
    StampUrl built from the skipped entries.

    Raises ContextException on a non-conforming first or second heading,
    or when stamp, pageurl or aname is still missing at the end.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    i = 0
    if headspeak[i][0] != 'Initial' or headspeak[i][2]:
        print(headspeak[0])
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # "Written Answers" heading: skipped only when present and without
    # speeches; in every other case nothing happens at this point.
    written = re.match('(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?(?i)', headspeak[i][0])
    if written and not headspeak[i][2]:
        i += 1

    heading = headspeak[i][0]
    # NOTE(review): both arguments below appear as a plain space here;
    # presumably the first was a non-breaking-space entity -- confirm.
    givendate = heading.replace(" ", " ")
    givendate = re.sub("</?i>", "", givendate)
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)

    # the date is only parsed when the heading is not one of the two notes
    isnote = (re.match('(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*', heading)
              or re.match('(?:<stamp[^>]*>)?The following question was answered on.*', heading))
    if (not isnote and sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[i][2]:
        if heading not in parlPhrases.wransmajorheadings or headspeak[i][2]:
            print(headspeak[i])
            raise ContextException('non-conforming second heading', stamp=None, fragment=heading)
    else:
        i += 1

    # url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for skipped in range(i):
        stampurl.UpdateStampUrl(headspeak[skipped][0])
        stampurl.UpdateStampUrl(headspeak[skipped][1])

    # Later editions seem to miss the first column number: take the first
    # column number found in any speech text and step back by one.
    if not stampurl.stamp:
        for entry in headspeak:
            speechtext = ''.join([speech[1] for speech in entry[2]])
            firstcol = re.search('colnum="(\d+)W"', speechtext)
            if firstcol:
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>' % (sdate, int(firstcol.group(1)) - 1))
                break

    if not (stampurl.stamp and stampurl.pageurl and stampurl.aname):
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)
def StripLordsDebateHeadings(headspeak, sdate):
    """Skip the opening headings of a House of Lords debate file.

    Skips the 'Initial' entry, an optional "House of Lords" heading and
    the date heading (a bad date heading is only printed, not fatal).
    Unless the next heading is a Queen's Speech or starts with
    "Parliament", the optional "the House met at" and "Prayers" headings
    are skipped as well.

    headspeak -- indexable sequence of entries; element 0 is the heading
        text, element 1 is passed to UpdateStampUrl, element 2 has to be
        false on the date heading.
    sdate -- value the date heading is compared against.

    Returns (ih, stampurl): index of the first entry not skipped and the
    StampUrl built from the skipped entries.

    Raises Exception when no stamp / page url was collected.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    ih = StripDebateHeading('Initial', 0, headspeak)

    # House of Lords
    ih = StripDebateHeading('house of lords(?i)', ih, headspeak, True)

    # e.g. "Thursday, 18th December 2003."
    dateheading = re.match('(?:<stamp aname="[^"]*"/>)*([\w\s\d,]*)\.?', headspeak[ih][0])
    if (dateheading
            and sdate == mx.DateTime.DateTimeFrom(dateheading.group(1)).date
            and not headspeak[ih][2]):
        ih += 1
    else:
        # treated as recoverable: report the entry and carry on
        print(headspeak[ih])

    if re.match('(?:<stamp aname="[^"]*"/>)*(?:THE )?(?i)QUEEN(?:\'|&....;)S SPEECH', headspeak[ih][0]):
        # don't advance, because this is the heading (works for 2005-05-17)
        print(headspeak[ih][0])
        print("QUEENS SPEECH")
    elif re.match("Parliament", headspeak[ih][0]):
        # don't advance; this is a title (works for 2005-05-11)
        print("parliamentparliament")
    else:
        # e.g. "Reassembling after the Christmas Recess, the House met at
        # half-past two of the clock: The LORD CHANCELLOR on the Woolsack."
        # or "The House met at eleven of the clock (Prayers having been
        # read earlier ...): The CHAIRMAN OF COMMITTEES on the Woolsack."
        ih = StripDebateHeading(
            '(?:reassembling.*?recess, )?the house (?:met|resumed)(?: for Judicial Business)? at ([^(]*)(?i)',
            ih, headspeak, True)
        # e.g. "Prayers--Read by the Lord Bishop of Southwell."
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

    # url, colnum and time stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for skipped in range(ih):
        stampurl.UpdateStampUrl(headspeak[skipped][1])

    if not (stampurl.stamp and stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def FilterWMSColnum(fout, text, sdate):
    """Copy text to fout, turning column and anchor markers into stamps.

    text is cut up with recomb.split and each fragment handled in turn:
      * column marker (recolumnumvals): date checked against sdate, then
        a space and a '<stamp coldate=.. colnum="..WS"/>' are written;
      * continuation marker (recolnumcontvals): date and column checked,
        nothing written;
      * anchor (reanamevals): written as '<stamp aname=".."/>';
      * anything else is written through unchanged.

    fout -- object with a write() method.
    sdate -- value the column dates are compared against.

    Raises ContextException on a date mismatch, a column number lower
    than the current one, a continuation for a different column, or an
    unhandled fragment that still matches recomb.
    """
    stamp = StampUrl(sdate)  # for error messages
    colnum = -1              # -1 until the first column marker is seen

    for fss in recomb.split(text):
        colmatch = recolumnumvals.match(fss)
        if colmatch:
            if sdate != mx.DateTime.DateTimeFrom(colmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = int(colmatch.group(2))
            # the first column and any forward step are fine
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)

            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sWS"/>' % (sdate, lcolnum)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        contmatch = recolnumcontvals.match(fss)
        if contmatch:
            if sdate != mx.DateTime.DateTimeFrom(contmatch.group(1)).date:
                raise ContextException("Cont column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)
            if colnum != int(contmatch.group(2)):
                raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp)
            continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected: check that nothing obvious was missed
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stamp)
        fout.write(fss)
def LordsFilterSpeakers(fout, text, sdate):
    """Copy text to fout, replacing bold speaker names with <speaker> tags.

    text is cut up with respeaker.split.  Fragments that respeakerb does
    not match, and bold fragments recognised as non-speakers (renonspek,
    leading quote or bracket, no lower-case letter), are written through
    unchanged; empty bold fragments are dropped.  The rest is taken apart
    with respeakervals, mapped through the offices seen so far in this
    call, and looked up with lordsList.GetLordIDfname.

    fout -- object with a write() method.
    sdate -- passed to StampUrl and the name lookup.

    Raises ContextException when a bold fragment does not fit
    respeakervals, or when an office is used with a second name (two
    dates are exempt from that check).

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    stampurl = StampUrl(sdate)
    officematches = {}   # office title -> name first seen with it
    generictag = '<speaker person_id="%s" speakername="%s">%s</speaker>'

    for fss in respeaker.split(text):
        # get rid of non-bold stuff
        bold = respeakerb.match(fss)
        if not bold:
            fout.write(fss)
            stampurl.UpdateStampUrl(fss)
            continue
        stampurl.UpdateStampUrl(fss)

        # grab a trailing colon if there is one
        fssb = bold.group(1)
        if bold.group(2):
            fssb = fssb + ":"

        # remove the cruft
        fssb = re.sub('<stamp aname="[^"]*"/>', '', fssb)
        fssb = re.sub('</b><b>', '', fssb)

        # empty bold phrase
        if not re.search('\S', fssb):
            continue

        # not a speaker: division/contents/amendment, a quoted or bracketed
        # title inserted in an amendment, or an all-caps title / clause number
        if (renonspek.search(fssb)
                or re.match('("|\[|")', fssb)
                or not re.search('[a-z]', fssb)):
            fout.write(fss)
            continue

        # start piecing apart the name by office and leadout type
        namec = respeakervals.match(fssb)
        if not namec:
            print("* %s *" % fssb)
            raise ContextException("bad format", stamp=stampurl, fragment=fssb)

        if namec.group('bracket'):
            # "Office (Name)": the bracket holds the person
            name = re.sub('\s+', ' ', namec.group('bracket'))
            loffice = re.sub('\s+', ' ', namec.group('name'))
        else:
            name = re.sub('\s+', ' ', namec.group('name'))
            loffice = None
        colon = namec.group('colon') or ""

        # get rid of some standard ones
        if re.match('the lord chancellor|noble lords|a noble lord|a noble baroness|the speaker(?i)', name):
            fout.write(generictag % ('unknown', name, name))
            continue

        # map through any office information
        if loffice:
            if (not re.match("The (Deputy |Minister of State)", loffice)) and (loffice in officematches):
                if sdate != '2014-09-26' and sdate != '2012-09-24' and officematches[loffice] != name:
                    raise ContextException(
                        "office inconsistency, loffice: %s name: %s officematches: %s"
                        % (loffice, name, officematches[loffice]),
                        stamp=stampurl, fragment=fssb)
            else:
                officematches[loffice] = name
        elif name in officematches:
            # only the office was given: use the name recorded for it
            loffice = name
            name = officematches[loffice]

        if regenericspeak.match(name):
            fout.write(generictag % ('unknown', name, name))
            continue

        # maybe throw the exception on the outside
        lsid = lordsList.GetLordIDfname(name, loffice=loffice, sdate=sdate, stampurl=stampurl)
        if lsid:
            fout.write('<speaker person_id="%s" speakername="%s" colon="%s">%s</speaker>'
                       % (lsid, name, colon, name))
        else:
            fout.write('<speaker person_id="unknown" error="No match" speakername="%s" colon="%s">%s</speaker>'
                       % (name, colon, name))

        if namec.group('maiden'):
            fout.write('<i>%s</i>' % namec.group('maiden'))
def FilterDebateColTime(fout, text, sdate, typ):
    """Copy text to fout, turning column, time and anchor markers into stamps.

    When typ is "debate" the legacy substitutions are applied first.  The
    text is then cut up with recomb.split and each fragment handled:
      * column marker: date and numbering checked, then a
        '<stamp coldate=.. colnum=../>' with the highest column so far;
      * continuation marker: checked, nothing written;
      * time marker: converted by TimeProcessing to '<stamp time=../>';
      * anchor: written as '<stamp aname=../>';
      * anything else is written through unchanged.

    fout -- object with a write() method.
    sdate -- value the column dates are compared against.
    typ -- only the value "debate" is tested here.

    Raises ContextException on date or column disagreements, on a time
    TimeProcessing rejects, and on unhandled fragments that match recomb
    or remarginal.  Asserts that "today" pages hold no column markers.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    stamp = StampUrl(sdate)  # for error messages
    istoday = re.match('<pagex [^>]*type="today"', text)
    if istoday:
        fout.write('<stamp colnum="000"/>\n')

    colnum = -1
    previoustime = []
    for fss in recomb.split(text):
        # column number type
        colmatch = recolumnumvals.match(fss)
        if colmatch:
            assert not istoday  # no columns in today

            if sdate != mx.DateTime.DateTimeFrom(colmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)

            lcolnum = int(colmatch.group(2))
            # Tolerated: the first column, a single step back (spurious
            # decrement) and any step forward (columns get skipped during
            # division listings).  Only a drop of two or more is an error.
            if colnum != -1 and lcolnum < colnum - 1:
                raise ContextException("Colnum not incrementing %d smaller than %d -- %s" % (lcolnum, colnum, fss),
                                       stamp=stamp, fragment=fss)

            # the column written out has to increase no matter what
            if lcolnum > colnum:
                colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
            fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
            continue

        contmatch = recolnumcontvals.match(fss)
        if contmatch:
            if sdate != mx.DateTime.DateTimeFrom(contmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)
            lcolnum = int(contmatch.group(2))
            if colnum != lcolnum and sdate < '2006-05-08':
                raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                       stamp=stamp, fragment=fss)
            continue

        timematch = retimevals.match(fss)
        if timematch:
            # third argument: whether the marker starts with '['
            stamped = TimeProcessing(timematch.group(1), previoustime, (timematch.group(0)[0] == '['), stamp)
            if not stamped:
                raise ContextException("Time not matched: " + timematch.group(1), stamp=stamp, fragment=fss)
            fout.write('<stamp time="%s"/>' % stamped)
            previoustime.append(stamped)
            continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            aname = anchor.group(1)
            stamp.aname = '<stamp aname="%s"/>' % aname
            fout.write('<stamp aname="%s"/>' % aname)
            continue

        # nothing detected: check that nothing obvious was missed
        if recomb.match(fss):
            print("$$$ %s $$$" % fss)
            print(regcolnumcont)
            print(re.match(regcolnumcont + "(?i)", fss))
            raise ContextException('regexpvals not general enough', stamp=stamp, fragment=fss)

        marginal = remarginal.search(fss)
        if marginal:
            print(fss)
            print('--------------------------------\n')
            print("marginal found:  %s" % (marginal.groups(),))
            print("zeroth:  %s" % marginal.group(0))
            print('--------------------------------\n')
            raise ContextException('marginal coltime/a detection case', stamp=stamp, fragment=fss)

        fout.write(fss)
def StripDebateHeadings(headspeak, sdate):
    """Skip the opening headings of a House of Commons debate file.

    Skips, in order: the 'Initial' entry, the bound-volume title headings
    when present, "House of Commons", the date line (unless the entry is
    already "the House met at"), the "the House met at ..." line (except
    on 2001-06-13) and, outside the listed start-of-parliament dates, the
    optional prayers / standing order / "in the Chair" headings.

    headspeak -- indexable sequence of entries; element 0 is the heading
        text, element 1 is passed to UpdateStampUrl, element 2 has to be
        false on the date and "met at" lines.
    sdate -- date string; compared with the date heading and with
        literal 'YYYY-MM-DD' strings.

    Returns (ih, stampurl): index of the first entry not skipped and the
    StampUrl built from the skipped entries, with its timestamp set from
    the wording of the "met at" line when there was one.

    Raises Exception on a date mismatch or a missing stamp / page url;
    ContextException on a bad "met at" line or an unlisted start time.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?: )+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)

    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        # each of these title lines is optional
        for optional in ('IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE',
                         'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND',
                         '\[WHICH OPENED .*?\]',
                         '.*? YEAR OF THE REIGN OF.*?',
                         'HER MAJESTY QUEEN ELIZABETH II',
                         'SI.*? SERIES.*?VOLUME \d+',
                         'SI.*? SERIES',
                         'VOLUME \d+'):
            ih = StripDebateHeading(optional, ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # e.g. "Tuesday 9 December 2003"
    if not re.match('the house met at .*(?i)', headspeak[ih][0]):
        # NOTE(review): both arguments of the first sub appear as a plain
        # space here; presumably the pattern was a non-breaking-space
        # entity -- confirm.
        givendate = re.sub(' ', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', givendate)
        if gd:
            givendate = gd.group(1)
        if sdate != mx.DateTime.DateTimeFrom(givendate).date or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' % (repr(headspeak[ih]), sdate))
        ih += 1

    gstarttime = None
    if sdate != "2001-06-13":
        # e.g. "The House met at half-past Ten o'clock"
        gstarttime = re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$(?i)',
            headspeak[ih][0])
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException('non-conforming "the house met at" heading %s' % repr(headspeak[ih]), "")
        ih += 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:
        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)
        ih = StripDebateHeading('pursuant to the Standing Order\.', ih, headspeak, True)
        # in the chair
        ih = StripDebateHeading('\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih, headspeak, True)

    # url, colnum and time stamps that occur before anything else
    stampurl = StampUrl(sdate)

    # set the time from the wording of the 'house met at' line
    if gstarttime:
        spoken = re.sub('</?i>', ' ', gstarttime.group(1))
        spoken = re.sub('\s+', ' ', spoken)
        # tried in this order; the first pattern that matches wins
        knowntimes = (
            ("half-past Nine(?i)", '09:30:00'),
            ("a quarter to Ten o(?i)", '09:45:00'),
            ("Ten o'clock(?i)", '10:00:00'),
            ("half-past Ten(?i)", '10:30:00'),
            ("Eleven o'clock(?i)", '11:00:00'),
            ("twenty-five minutes past\s*Eleven(?i)", '11:25:00'),
            ("twenty-six minutes past\s*Eleven(?i)", '11:26:00'),
            ("twenty-nine minutes past\s*Eleven(?i)", '11:29:00'),
            ("half-past Eleven(?i)", '11:30:00'),
            ("Twelve noon(?i)", '12:00:00'),
            ("half-past One(?i)", '13:30:00'),
            ("half-past Two(?i)", '14:30:00'),
            ("twenty minutes to Three(?i)", '14:40:00'),
            ("10 minutes past Three(?i)", '15:10:00'),
            ("Six o'clock(?i)", '18:00:00'),
        )
        for wording, newtime in knowntimes:
            if re.match(wording, spoken):
                break
        else:
            raise ContextException("Start time not known: " + spoken)
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for skipped in range(ih):
        stampurl.UpdateStampUrl(headspeak[skipped][1])

    if not (stampurl.stamp and stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def FilterWransColnum(fout, text, sdate):
    """Copy text to fout, turning column and anchor markers into stamps.

    The legacy substitutions are applied and empty {**con**} pairs are
    removed, then the text is cut up with recomb.split:
      * column marker: checked, then a space and a
        '<stamp coldate=.. colnum="..W"/>' are written;
      * continuation marker: handled by whichever group matched (dated
        form, group 5, or group 6 which also advances the column); a
        space is written in its place, plus a stamp for group 6;
      * anchor: written as '<stamp aname=../>';
      * anything else is written through unchanged.

    fout -- object with a write() method.
    sdate -- date string; compared with column dates and '2006-05-08'.

    Raises ContextException on date or column disagreements and on an
    unhandled fragment that still matches recomb.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    # legacy individual substitution rules
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    # remove junk
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages
    colnum = -1              # -1 until the first column marker is seen

    for fss in recomb.split(text):
        colmatch = recolumnumvals.match(fss)
        if colmatch:
            if sdate != mx.DateTime.DateTimeFrom(colmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = int(colmatch.group(2))
            # the first column and any forward step are fine (column
            # numbers do get skipped during division listings)
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)

            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        cont = recolnumcontvals.match(fss)
        if cont:
            ldate = cont.group(1) or cont.group(3) or None
            lcolnum = cont.group(2) or cont.group(4) or None

            if ldate:
                ldate = mx.DateTime.DateTimeFrom(ldate).date
                if sdate != ldate:
                    raise ContextException("Cont column date disagrees %s -- %s" % (sdate, fss),
                                           fragment=fss, stamp=stamp)
                lcolnum = int(lcolnum)
                if colnum != lcolnum and sdate < '2006-05-08':
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                # no need to output anything
                fout.write(' ')
                continue

            if cont.group(5):
                lcolnum = int(cont.group(5))
                if colnum != lcolnum and colnum != lcolnum + 1:
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                fout.write(' ')
                continue

            if cont.group(6):
                lcolnum = int(cont.group(6))
                if colnum + 1 != lcolnum:
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
                fout.write(' ')
                fout.write(stamp.stamp)
                continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected: check that nothing obvious was missed
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stamp)
        fout.write(fss)
def FilterDebateColTime(fout, text, sdate, typ):
    """Copy text to fout, turning column, time and anchor markers into stamps.

    When typ is "debate" the legacy substitutions are applied first.  The
    text is then cut up with recomb.split and each fragment handled:
      * column marker: date and numbering checked, then a
        '<stamp coldate=.. colnum=../>' with the highest column so far;
      * continuation marker: checked, nothing written;
      * time marker: converted by TimeProcessing to '<stamp time=../>';
      * anchor: written as '<stamp aname=../>';
      * anything else is written through unchanged.

    fout -- object with a write() method.
    sdate -- value the column dates are compared against.
    typ -- only the value "debate" is tested here.

    Raises ContextException on date or column disagreements, on a time
    TimeProcessing rejects, and on unhandled fragments that match recomb
    or remarginal.  Asserts that "today" pages hold no column markers.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    stamp = StampUrl(sdate)  # for error messages
    istoday = re.match('<pagex [^>]*type="today"', text)
    if istoday:
        fout.write('<stamp colnum="000"/>\n')

    colnum = -1
    previoustime = []
    for fss in recomb.split(text):
        # column number type
        colmatch = recolumnumvals.match(fss)
        if colmatch:
            assert not istoday  # no columns in today

            if sdate != mx.DateTime.DateTimeFrom(colmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)

            lcolnum = int(colmatch.group(2))
            # Tolerated: the first column, a single step back (spurious
            # decrement) and any step forward (columns get skipped during
            # division listings).  Only a drop of two or more is an error.
            if colnum != -1 and lcolnum < colnum - 1:
                raise ContextException("Colnum not incrementing %d smaller than %d -- %s" % (lcolnum, colnum, fss),
                                       stamp=stamp, fragment=fss)

            # the column written out has to increase no matter what
            if lcolnum > colnum:
                colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
            fout.write('<stamp coldate="%s" colnum="%s"/>' % (sdate, colnum))
            continue

        contmatch = recolnumcontvals.match(fss)
        if contmatch:
            if sdate != mx.DateTime.DateTimeFrom(contmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), stamp=stamp, fragment=fss)
            lcolnum = int(contmatch.group(2))
            if colnum != lcolnum and sdate < '2006-05-08':
                raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                       stamp=stamp, fragment=fss)
            continue

        timematch = retimevals.match(fss)
        if timematch:
            # third argument: whether the marker starts with '['
            stamped = TimeProcessing(timematch.group(1), previoustime, (timematch.group(0)[0] == '['), stamp)
            if not stamped:
                raise ContextException("Time not matched: " + timematch.group(1), stamp=stamp, fragment=fss)
            fout.write('<stamp time="%s"/>' % stamped)
            previoustime.append(stamped)
            continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            aname = anchor.group(1)
            stamp.aname = '<stamp aname="%s"/>' % aname
            fout.write('<stamp aname="%s"/>' % aname)
            continue

        # nothing detected: check that nothing obvious was missed
        if recomb.match(fss):
            print("$$$ %s $$$" % fss)
            print(regcolnumcont)
            print(re.match(regcolnumcont + "(?i)", fss))
            raise ContextException('regexpvals not general enough', stamp=stamp, fragment=fss)

        marginal = remarginal.search(fss)
        if marginal:
            print(fss)
            print('--------------------------------\n')
            print("marginal found:  %s" % (marginal.groups(),))
            print("zeroth:  %s" % marginal.group(0))
            print('--------------------------------\n')
            raise ContextException('marginal coltime/a detection case', stamp=stamp, fragment=fss)

        fout.write(fss)
def FilterWransColnum(fout, text, sdate):
    """Copy text to fout, turning column and anchor markers into stamps.

    The legacy substitutions are applied and empty {**con**} pairs are
    removed, then the text is cut up with recomb.split:
      * column marker: checked, then a space and a
        '<stamp coldate=.. colnum="..W"/>' are written;
      * continuation marker: handled by whichever group matched (dated
        form, group 5, or group 6 which also advances the column); a
        space is written in its place, plus a stamp for group 6;
      * anchor: written as '<stamp aname=../>';
      * anything else is written through unchanged.

    fout -- object with a write() method.
    sdate -- date string; compared with column dates and '2006-05-08'.

    Raises ContextException on date or column disagreements and on an
    unhandled fragment that still matches recomb.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    # legacy individual substitution rules
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    # remove junk
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages
    colnum = -1              # -1 until the first column marker is seen

    for fss in recomb.split(text):
        colmatch = recolumnumvals.match(fss)
        if colmatch:
            if sdate != mx.DateTime.DateTimeFrom(colmatch.group(1)).date:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = int(colmatch.group(2))
            # the first column and any forward step are fine (column
            # numbers do get skipped during division listings)
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)

            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
            fout.write(" ")
            fout.write(stamp.stamp)
            continue

        cont = recolnumcontvals.match(fss)
        if cont:
            ldate = cont.group(1) or cont.group(3) or None
            lcolnum = cont.group(2) or cont.group(4) or None

            if ldate:
                ldate = mx.DateTime.DateTimeFrom(ldate).date
                if sdate != ldate:
                    raise ContextException("Cont column date disagrees %s -- %s" % (sdate, fss),
                                           fragment=fss, stamp=stamp)
                lcolnum = int(lcolnum)
                if colnum != lcolnum and sdate < "2006-05-08":
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                # no need to output anything
                fout.write(" ")
                continue

            if cont.group(5):
                lcolnum = int(cont.group(5))
                if colnum != lcolnum and colnum != lcolnum + 1:
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                fout.write(" ")
                continue

            if cont.group(6):
                lcolnum = int(cont.group(6))
                if colnum + 1 != lcolnum:
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
                fout.write(" ")
                fout.write(stamp.stamp)
                continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            stamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(stamp.aname)
            continue

        # nothing detected: check that nothing obvious was missed
        if recomb.match(fss):
            raise ContextException("regexpvals not general enough", fragment=fss, stamp=stamp)
        fout.write(fss)
def FilterWransSpeakers(fout, text, sdate):
    """Copy text to fout, replacing bold speaker names with <speaker> tags.

    Steps:
      1. apply the legacy substitutions;
      2. put bold tags round "Name: " at the start of a paragraph when
         the reporters left them off and the words match a known name;
      3. split the text into tables, bold phrases and the text between;
      4. turn each non-empty bold phrase into a <speaker> element,
         moving trailing text and "[holding answer ...]" brackets into
         the following piece and pulling a preceding question number
         out of the previous piece.

    fout -- object with a writelines() method.
    sdate -- date string passed to the matchers.

    Raises ContextException ("No name match") when a name cannot be
    matched and none of the special cases applies.

    NOTE(review): block extents were reconstructed from a flattened copy
    of this function; confirm the nesting against the repository.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Bold is used below to detect names, but occasionally the reporters
    # miss it out, as in:
    #   <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    #   <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    unbolded = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for lead, filler, candidate, colon in unbolded:
        # only fix if the words are a known name (even an ambiguous one)
        if not memberList.fullnametoids(candidate, sdate):
            continue
        plain = "%s%s%s%s" % (lead, filler, candidate, colon)
        bolded = "%s<b>%s%s</b>" % (lead, candidate, colon)
        if plain not in text:
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(plain, bolded)

    # e.g. <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = '<b>.*?</b>(?:\s*:)?'
    # tables have bolds in them, so must be separated out
    ltableregexp = '<table[^>]*>[\s\S]*?</table>'
    tableregexp = ltableregexp + '(?i)'

    fs = re.split('(%s|%s)(?i)' % (ltableregexp, lspeakerregexp), text)

    stampurl = StampUrl(sdate)  # for error messages

    # index loop on purpose: neighbouring pieces get rewritten on the way
    for i in range(len(fs)):
        # speakers have new stamps in them
        fss = stampurl.UpdateStampUrl(fs[i])

        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall('<b>\s*([^:]*)[:\s]*?([^<:]*)</b>(?i)', fss)
        if not speakerg:
            continue
        boldpart, trailing = speakerg[0]

        # we have a string in bold
        boldnamestring = boldpart.strip()

        # trailing text after the colon in the bold bit goes to the next piece
        if re.search('\S', trailing):
            fs[i + 1] = trailing + fs[i + 1]

        # push the square brackets outside of the bold string, as in
        #   <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall('^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            fs[i + 1] = sqb[0][1] + fs[i + 1]

        # get rid of blank bold strings
        if not re.search('\S', boldnamestring):
            fs[i] = ''
            continue

        # Pull in a preceding question number.  These signify aborted oral
        # questions, normally useless and at the start of the page, e.g.
        #   27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall('^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1])
            if oqnsep:
                before, qnum, after = oqnsep[0]
                fs[i - 1] = before + after
                boldnamestring = qnum + ' ' + boldnamestring

        # take out the initial digits and dot which may have just been put
        # in (although sometimes they would have been there already)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
        # TODO: do something with deci here (it is the "failed oral
        # questions" signifier)

        if '<broken-name>' in boldnamestring:
            # explicitly bad/ambiguous name which will never match
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)
        else:
            # split a bracketed constituency out if present
            brakmatch = re.match("(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename, remadecons) = memberList.matchfullnamecons(
                name, cons, sdate, alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)

            if not person_id:
                # NOTE(review): the nesting of the branches below is the
                # most plausible reading of the flattened source; verify.
                # As read, "Mr. Michael Foster" with a different first
                # candidate leaves person_id unset.
                if remadename == "MultipleMatch":
                    if boldnamestring == 'Mr. Michael Foster':
                        if remadecons[0] == 'uk.org.publicwhip/person/10209':
                            person_id = remadecons[0]
                            remadename = ' speakername="Michael Foster"'
                            remadecons = 'Worcester'
                    else:
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print(" No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put the record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % (
            person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # output every piece, rewritten or not, into the file
    fout.writelines(fs)
# Wordings that can follow "The House met at" in the opening heading, paired
# with the clock time each one stands for.  They are tried in this order and
# matched case-insensitively against the start of the wording.
_HOUSE_MET_TIMES = (
    (r"half-past Nine", '09:30:00'),
    (r"a quarter to Ten o", '09:45:00'),
    (r"Ten o'clock", '10:00:00'),
    (r"half-past Ten", '10:30:00'),
    (r"Eleven o'clock", '11:00:00'),
    (r"twenty-five minutes past\s*Eleven", '11:25:00'),
    (r"twenty-six minutes past\s*Eleven", '11:26:00'),
    (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
    (r"half-past Eleven", '11:30:00'),
    (r"Twelve noon", '12:00:00'),
    (r"half-past One", '13:30:00'),
    (r"half-past Two", '14:30:00'),
    (r"twenty minutes to Three", '14:40:00'),
    (r"10 minutes past Three", '15:10:00'),
    (r"Six o'clock", '18:00:00'),
)


def _HouseMetClockTime(wording):
    """Return the HH:MM:SS string for the wording after "the house met at".

    Raises ContextException when the wording is not in _HOUSE_MET_TIMES.
    """
    for pattern, clock in _HOUSE_MET_TIMES:
        if re.match(pattern, wording, re.I):
            return clock
    raise ContextException("Start time not known: " + wording)


def StripDebateHeadings(headspeak, sdate):
    """Step past the standard headings that open a debate file.

    Starting at index 0, each expected heading is consumed through
    StripDebateHeading (whose fourth argument, when True, is passed for the
    headings treated here as optional).  The date heading is checked against
    ``sdate`` and the "The House met at ..." heading supplies the initial
    time stamp.

    Args:
        headspeak: indexable sequence of heading records.  Only element [0]
            (the heading text that is pattern-matched), [1] (handed to
            ``StampUrl.UpdateStampUrl``) and [2] (must be falsy on the date
            and "house met at" headings) are read here.
        sdate: the sitting date; it is compared with date strings such as
            "2001-06-13" and with ``mx.DateTime...date``.

    Returns:
        ``(ih, stampurl)``: the index of the first heading that was not
        consumed, and a StampUrl updated from every consumed heading.

    Raises:
        Exception: the date heading disagrees with ``sdate``, or no stamp /
            page url was found in the consumed headings.
        ContextException: the "the house met at" heading is malformed or its
            time wording is not recognised.
    """
    # check and strip the first two headings in as much as they are there
    ih = 0
    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', ih, headspeak)

    # Patterns given to StripDebateHeading are passed exactly as before
    # (including any trailing "(?i)"): how that helper uses them is not
    # visible from this function.

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?: )+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE',
            ih, headspeak, True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND',
            ih, headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        # NOTE(review): as it appears here the pattern and the replacement
        # are both a single space, which makes this substitution a no-op;
        # it may once have targeted an HTML entity -- confirm.
        givendate = re.sub(' ', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match(r'(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        if (sdate != mx.DateTime.DateTimeFrom(givendate).date) \
                or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' %
                            (repr(headspeak[ih]), sdate))
        ih = ih + 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house '
            r'(?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)'
            r'(?:, and the Speaker-Elect having taken the Chair;)?'
            r'(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:
        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)
        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih,
                                headspeak, True)
        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        wording = gstarttime.group(1)
        wording = re.sub('</?i>', ' ', wording)
        wording = re.sub(r'\s+', ' ', wording)
        stampurl.timestamp = '<stamp time="%s"/>' % _HouseMetClockTime(wording)

    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Write ``text`` to ``fout`` with speaker introductions marked up.

    The text is first patched (bold tags restored around unbolded names that
    ``memberList.fullnametoids`` recognises, "(Urgent Question)" moved out of
    the bold name), then split with ``recomb``.  Each fragment that
    ``respeakervals`` matches is replaced by a ``<speaker ...>`` element
    built from ``memberList.matchdebatename``; other fragments are written
    through.

    Args:
        fout: object whose ``write`` method receives the filtered output.
        text: the debate text to filter.
        sdate: sitting date, handed on to the member-matching and stamp
            helpers.
        typ: kind of transcript.  "westminhall" makes the chair named in the
            "[... in the Chair]" phrase stand in for "deputy speaker";
            "debate" applies ``ApplyFixSubstitutions`` first.  The value is
            also passed to ``memberList.matchdebatename``.

    Raises:
        ContextException: the "[... in the Chair]" phrase is missing for
            "westminhall", a name could not be matched (any exception from
            ``matchdebatename`` is re-raised in this form with stamp and
            fragment attached), or a fragment looks like a speaker/marginal
            case that the detection regexps did not handle.
    """
    if typ == "westminhall":
        depspeakerrg = re.search(
            r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)'
        r'([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)',
        text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        # p2 (an empty <b></b> and/or whitespace) is dropped in the rewrite
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        # only rewrite when the unbolded text really is a known member name
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name
    urgentqns = re.findall(
        r'(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)'
        r'(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)',
        text, re.I)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker
        # detection regexp)
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are
        # surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue
        if re.match('<b>(“)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            # we can use oqnum to detect oral questions
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if
            # it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub("\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and not spstrbrack \
                    and re.search(r"deputy[ \-]speaker", spstr, re.I):
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate,
                                                    typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (
                anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough',
                                   fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(
                ' marginal speaker detection case: %s' % marginal.group(0),
                fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
# Hour words seen in the time-bearing heading matched by regtime3vals, mapped
# to a "%s"-template for the minutes.  Kept as an explicit list of observed
# examples to keep track of the transition between am and pm.
_LORDS_HEADING_HOURS = {
    "two": "2:%s pm",
    "three": "3:%s pm",
    "six": "6:%s pm",
    "nine": "9:%s am",
    "eleven": "11:%s am",
    "ten": "10:%s am",
}


def FilterLordsColtime(fout, text, sdate):
    """Copy ``text`` to ``fout``, turning column/time/anchor marks into stamps.

    ``text`` is split with ``recomb`` and each fragment is classified:

    * column heading (``recolumnumvals``): its date must equal ``sdate``;
      a ``<stamp coldate=... colnum=.../>`` is written when this is the first
      column seen or the number is 1 to 5 ahead of the current one.  Other
      column headings are dropped without output.
    * time (``retimevals``): converted by ``TimeProcessing`` and written as
      ``<stamp time=.../>``.
    * heading carrying a worded time (``regtime3vals``): the heading is
      written back unchanged, followed by a time stamp derived from it.
    * anchor (``reanamevals``): written as ``<stamp aname=.../>`` and
      recorded on the StampUrl used for error context.
    * anything else is written through unchanged, unless it still looks like
      a marker, which raises.

    Args:
        fout: object whose ``write`` method receives the filtered output.
        text: the Lords text to filter.
        sdate: sitting date, compared with the date in each column heading.

    Raises:
        ContextException: a column heading's date disagrees with ``sdate``,
            or a fragment matches ``recomb`` / ``remarginal`` without being
            handled above.
        AssertionError: the worded-time heading is not one of the known
            forms (or, via ``assert``, an earlier time was already recorded).
    """
    colnum = -1
    time = ''
    stampurl = StampUrl(sdate)
    previoustime = []

    for fss in recomb.split(text):
        # column number type

        # we need some very elaborate checking to sort out the sections, by
        # titles that are sometimes on the wrong side of the first column,
        # and by colnums that miss the GC code in that section.
        # column numbers are also missed during divisions, and this
        # exception should be detected and noted.

        # That implies that this is the filter which detects the boundaries
        # between the standard four sections.
        columng = recolumnumvals.match(fss)
        if columng:
            # check date
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException(
                    "Column date disagrees %s -- %s" % (sdate, fss),
                    stamp=stampurl, fragment=fss)

            # check number
            lcolnum = int(columng.group(3))
            if lcolnum == colnum - 1:
                pass  # spurious decrementing of column number stamps
            elif lcolnum == colnum:
                pass  # spurious repeat of column number stamps
            # good (we get skipped columns in divisions);
            # the window was 2 but this caused us to miss ones
            elif (colnum == -1) or (colnum + 1 <= lcolnum <= colnum + 5):
                colnum = lcolnum
                fout.write('<stamp coldate="%s" colnum="%s%s"/>' %
                           (sdate, colnum, ""))
            # otherwise: column numbers do get skipped during division
            # listings, so a non-incrementing number is ignored
            continue

        timeg = retimevals.match(fss)
        if timeg:
            time = timeg.group(1)
            # NOTE(review): nesting reconstructed -- the stamp is assumed to
            # be written only when the captured text is not a stray closing
            # tag; confirm against the upstream file.
            if not re.match('(?:</h5>|</st>)', time, re.I):
                time = TimeProcessing(time, previoustime, False, stampurl)
                fout.write('<stamp time="%s"/>' % time)
                if time:
                    previoustime.append(time)
            continue

        # special lift a time out of the heading
        regtime3 = regtime3vals.match(fss)
        if regtime3:
            fout.write(fss)  # put this heading back into the flow of text
            assert not previoustime
            lntimematch = re.match(r"(half[\- ]past )?(\w+)(-thirty)?$",
                                   regtime3.group(1))
            lnhour = lntimematch and lntimematch.group(2)

            lntimep = _LORDS_HEADING_HOURS.get(lnhour)
            if lntimep is None:
                print("-------------'%s'" % regtime3.group(1))
                # raised explicitly so the check survives running with -O
                raise AssertionError(
                    "unrecognised worded time %r" % regtime3.group(1))

            # "half past X" and "X-thirty" must not both be present
            assert not lntimematch.group(1) or not lntimematch.group(3)
            halfpast = lntimematch.group(1) or lntimematch.group(3)
            ntime = lntimep % ("30" if halfpast else "00")
            time = TimeProcessing(ntime, previoustime, False, stampurl)
            fout.write('<stamp time="%s"/>' % time)
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            aname = anameg.group(1)
            fout.write('<stamp aname="%s"/>' % aname)
            stampurl.aname = aname
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            print("$$$ %s $$-$" % fss)
            # a programming error between splitting and matching
            raise ContextException(' regexpvals not general enough ',
                                   stamp=stampurl, fragment=fss)
        marginal = remarginal.search(fss)
        if marginal:
            print(marginal.group(0))
            lregcolumnum6 = r'<p>\s*</ul>\s*<a name="column_\d+">(?:</a>)?\s*<b>[^:<]*:\s*column\s*\d+\s*</b></p>\s*<ul><font size=3>'
            print(re.findall(lregcolumnum6, fss, re.I))
            raise ContextException(' marginal coltime detection case ',
                                   stamp=stampurl, fragment=fss)
        fout.write(fss)
def FilterWMSColnum(fout, text, sdate):
    """Copy ``text`` to ``fout``, replacing column and anchor marks by stamps.

    ``text`` is split with ``recomb``.  A column heading
    (``recolumnumvals``) becomes a space followed by
    ``<stamp coldate=... colnum="...WS"/>``; a continuation heading
    (``recolnumcontvals``) is checked and dropped; an anchor
    (``reanamevals``) becomes ``<stamp aname=.../>``.  Every other fragment
    is written unchanged.

    Args:
        fout: object whose ``write`` method receives the filtered output.
        text: the text to filter.
        sdate: sitting date, compared with the date in each column heading.

    Raises:
        ContextException: a heading's date differs from ``sdate``, a column
            number is lower than the previous one, a continuation heading
            names a different column from the current one, or a fragment
            still matches ``recomb`` without having been handled.
    """
    urlstamp = StampUrl(sdate)  # carried into exceptions for context
    current = -1  # stays -1 until the first column heading is seen

    for piece in recomb.split(text):
        # full column heading
        heading = recolumnumvals.match(piece)
        if heading:
            headingdate = mx.DateTime.DateTimeFrom(heading.group(1)).date
            if sdate != headingdate:
                raise ContextException(
                    "Column date disagrees %s -- %s" % (sdate, piece),
                    fragment=piece, stamp=urlstamp)
            seen = string.atoi(heading.group(2))
            # forward jumps are tolerated; only going backwards is an error
            if current != -1 and seen < current:
                raise ContextException(
                    "Colnum not incrementing %d -- %s" % (seen, piece),
                    fragment=piece, stamp=urlstamp)
            current = seen
            urlstamp.stamp = '<stamp coldate="%s" colnum="%sWS"/>' % (
                sdate, seen)
            fout.write(' ')
            fout.write(urlstamp.stamp)
            continue

        # "continued" column heading: validated, produces no output
        continued = recolnumcontvals.match(piece)
        if continued:
            headingdate = mx.DateTime.DateTimeFrom(continued.group(1)).date
            if sdate != headingdate:
                raise ContextException(
                    "Cont column date disagrees %s -- %s" % (sdate, piece),
                    fragment=piece, stamp=urlstamp)
            seen = string.atoi(continued.group(2))
            if current != seen:
                raise ContextException(
                    "Cont column number disagrees %d -- %s" % (current, piece),
                    fragment=piece, stamp=urlstamp)
            continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(piece)
        if anchor:
            urlstamp.aname = '<stamp aname="%s"/>' % anchor.group(1)
            fout.write(urlstamp.aname)
            continue

        # nothing recognised: make sure it is not a marker we failed to parse
        if recomb.match(piece):
            raise ContextException('regexpvals not general enough',
                                   fragment=piece, stamp=urlstamp)

        fout.write(piece)