def regidcodes(self, minhgid, sdate, qnumsseen): # find minimal qnum which will be used as the basis self.qnums.sort() if not self.qnums: print self.headingqb.stext[0] for ques in self.queses: print ques.stext raise ContextException('missing qnums on question') basegidq = 'uk.org.publicwhip/wrans/%s.%s' % (sdate, self.qnums[0]) self.headingqb.qGID = basegidq + ".h" # this is what we link to for rqnum in self.qnums[1:]: # the mapping for the other qnums self.altheadinggids.append('uk.org.publicwhip/wrans/%s.%s.h' % (sdate, rqnum)) # renumber the parts of the question (which aren't going to be linked to anyway) for i in range(len(self.queses)): self.queses[i].qGID = "%s.q%d" % (basegidq, i) for i in range(len(self.replies)): self.replies[i].qGID = "%s.r%d" % (basegidq, i) # make sure all qnums are new for qnum in self.qnums: if qnum in qnumsseen: print "repeated qnum:", qnum raise ContextException('repeated qnum', None, qnum) qnumsseen[qnum] = 1 # this value is used for labelling the major heading. # high probability that the value is stable, but it won't be used for linking if not minhgid or (basegidq < minhgid): minhgid = basegidq return minhgid
def __init__(self, date, stex):
    """Cut any question number out of the paragraph, then tokenize the rest.

    A match of reqnum anywhere in stex truncates the text at the start of
    the match; a match of refqnum at the start of what is left is cut off
    the front.  Whichever match was found is kept in self.rmqnum.

    Raises ContextException when both patterns match, or when cutting the
    leading qnum leaves fewer than 10 characters of non-tag text.
    """
    self.lastdate = ''
    self.toklist = []
    self.sdate = date

    # NOTE(review): pattern and replacement are both '&', so as written this
    # substitution leaves the text unchanged -- confirm the intended replacement.
    stex = re.sub('&(?!amp;)', '&', stex)

    # separate out any qnums at end of paragraph
    self.rmqnum = reqnum.search(stex)
    if self.rmqnum:
        stex = stex[:self.rmqnum.start(0)]

    # separate out qnums stuffed into front of paragraph (by the grabber of the speakername)
    frqnum = refqnum.match(stex)
    if frqnum:
        if self.rmqnum:
            raise ContextException(
                'Found question number [%s] in para, but already found [%s] at end (this probably just means it is being quoted, and you just need to change [] to ().'
                % (frqnum.group(1), self.rmqnum.group(1)))
        self.rmqnum = frqnum
        stex = stex[frqnum.end(0):]

        # what is left once the tags are dropped must still look like a paragraph
        stex_nohtml = re.sub('<[^>]*>', '', stex)
        if len(stex_nohtml) < 10:
            raise ContextException(
                'Removing question number from para appears to have removed all text (this probably just means a footnote marker is using [], just change to ()).'
            )

    self.TokenizePhraseRecurse(date, stex, 0)
def FilterWMSSpeakers(fout, text, sdate):
    """Copy text to fout, rewriting each speaker fragment as a <speaker> tag.

    text is split with recomb; every fragment is passed to the StampUrl
    tracker.  Fragments matching respeakervals are resolved through
    memberList.matchwmsname and written as
    '<anchor><speaker ATTRS>NAME</speaker>'; a fragment whose speaker name
    is empty after stripping is dropped.  All other fragments are written
    through unchanged.

    Raises ContextException when name matching fails, or when a fragment
    that is not recognised as a speaker still matches recomb or remarginal.
    """
    stampurl = StampUrl(sdate)

    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # speaker detection
        speakerg = respeakervals.match(fss)
        if not speakerg:
            # nothing detected
            # check if we've missed anything obvious
            if recomb.match(fss):
                raise ContextException('regexpvals not general enough',
                                       fragment=fss, stamp=stampurl)
            marginal = remarginal.search(fss)
            if marginal:
                raise ContextException(
                    ' marginal speaker detection case: %s' % marginal.group(0),
                    fragment=fss, stamp=stampurl)
            fout.write(fss)
            continue

        anamestamp = speakerg.group(1) or speakerg.group(2) or ""
        spstr = speakerg.group(3).strip()
        spstrbrack = speakerg.group(4)
        if not spstr:
            continue

        try:
            result = memberList.matchwmsname(spstr, spstrbrack, sdate)
        except Exception as e:
            raise ContextException(str(e), stamp=stampurl, fragment=fss)

        # put record in thisplace
        fout.write('%s<speaker %s>%s</speaker>\n' %
                   (anamestamp, result.encode("latin-1"), spstr))
def NewGrabLordDivisionProced(qbp, qbd): if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1: print qbp.stext raise ContextException("previous to division not speech", stamp=qbp.sstampurl) iskim = 1 while iskim <= len(qbp.stext) and not redivisionon.match( qbp.stext[-iskim]): iskim = iskim + 1 if iskim > len(qbp.stext): raise ContextException("Could not find Division 'title'", stamp=qbp.sstampurl) hdg = renewlorddiv.match(qbp.stext[-iskim + 1]) if not hdg: print qbp.stext[-iskim + 1] raise ContextException("no totals before division", stamp=qbp.sstampurl) # if previous thing is already a no-speaker, we don't need to break it out # (the coding on the question put is complex and multilined) if re.search('nospeaker="true"', qbp.speaker): qbp.stext = SubsPWtextset(qbp.stext) return None # copy the two lines into a non-speaking paragraph. qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl) qbdp.typ = 'speech' qbdp.stext = SubsPWtextset(qbp.stext[-iskim:]) # trim back the given one by two lines qbp.stext = qbp.stext[:-iskim] return qbdp
def StripWransHeadings(headspeak, sdate):
    """Skip the leading boilerplate headings of a written-answers file.

    headspeak -- sequence of heading entries; this function reads element
                 [0] (heading text), [1] (passed to UpdateStampUrl) and [2]
                 (truthy when the heading has speeches under it)
    sdate     -- date string compared against the date heading

    Returns (i, stampurl): the index of the first heading that was not
    skipped, and a StampUrl updated from the skipped headings.
    Raises ContextException when the first heading is not a bare 'Initial',
    when the second heading is neither an accepted date/received heading
    nor a known major heading, or when no complete stamp is found.
    """
    # check and strip the first two headings in as much as they are there
    i = 0
    if (headspeak[i][0] != 'Initial') or headspeak[i][2]:
        print headspeak[0]
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # import pdb;pdb.set_trace()
    # a bare "written answers" heading is skipped; anything else is left in place
    if (not re.match(
            '(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?(?i)',
            headspeak[i][0])) or headspeak[i][2]:
        if not re.match('The following answers were received.*',
                        headspeak[i][0]):
            pass
            # print headspeak[i]
    else:
        i += 1

    # NOTE(review): this replace has identical search and replacement text as
    # written, so it does nothing -- confirm what was meant to be replaced.
    givendate = string.replace(headspeak[i][0], " ", " ")
    givendate = re.sub("</?i>", "", givendate)
    # drop any leading <stamp aname=.../> tags from the candidate date text
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)

    # the heading is skipped when it is a "received"/"answered on" notice or
    # parses to sdate, and has no speeches; otherwise it must be a known
    # major heading without speeches, which is left for the caller
    if (not re.match('(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*', headspeak[i][0]) and not re.match('(?:<stamp[^>]*>)?The following question was answered on.*', headspeak[i][0]) and \
            (sdate != mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[i][2]:
        if (not parlPhrases.wransmajorheadings.has_key(
                headspeak[i][0])) or headspeak[i][2]:
            print headspeak[i]
            raise ContextException('non-conforming second heading',
                                   stamp=None, fragment=headspeak[i][0])
    else:
        i += 1

    # find the url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for j in range(0, i):
        stampurl.UpdateStampUrl(headspeak[j][0])
        stampurl.UpdateStampUrl(headspeak[j][1])

    # Later editions seem to miss first column number, sigh
    # fall back to the first colnum found in any speech text, minus one
    if not stampurl.stamp:
        for speeches in headspeak:
            text = ''.join([speech[1] for speech in speeches[2]])
            m = re.search('colnum="(\d+)W"', text)
            if m:
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>' %
                                        (sdate, int(m.group(1)) - 1))
                break

    if not stampurl.stamp or not stampurl.pageurl or not stampurl.aname:
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)
def EndHeading(self, nextheading, nextmajor=None): self.EndSpeech() if (self.heading == 'Initial') and self.shspeak: print 'Speeches without heading' # lost heading signals are found elswhere? # concatenate unspoken text with the title if it's a dangle outside heading # e.g. In 2003-01-15 we have heading "Birmingham Northern Relief Road " # with extra bit "(Low-noise Tarmac)" to pull in. if not re.match('(?:<[^>]*?>|\s)*$', self.unspoketext): # We deliberately don't put "." in to avoid matching "19." before paragraph starts gho = re.match( '(\s*[()A-Za-z\-,\'\"/&#; 0-9]+)((?:<[^>]*?>|\s)*)$', self.unspoketext) if gho and not renotheadingmarg.search(self.unspoketext): self.heading = self.heading + ' ' + gho.group(1) self.heading = re.sub("\s+", " ", self.heading) #self.unspoketext = gho.group(2) # print "merged dangling heading %s" % (self.heading) if len(self.heading) > 100: raise ContextException( "Suspiciously long merged heading part - is it OK? %s" % self.heading, stamp=None, fragment=self.heading) self.shtext.append( (self.heading, self.unspoketext, self.shspeak, self.major)) self.heading = nextheading self.major = nextmajor self.unspoketext = '' # for holding colstamps self.shspeak = []
def MatchRevName(self, fss, sdate, stampurl): assert fss lfn = re.match( '(.*?)(?: of (.*?))?, ? ?((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$', fss) if not lfn: print "$$$%s$$$" % fss raise ContextException("No match of format in MatchRevName", stamp=stampurl, fragment=fss) shorttitle = lfn.group(3) if shorttitle[-1] != '.': shorttitle += "." ltitle = titleconv[shorttitle] llordname = string.replace(lfn.group(1), ".", "") llordname = string.replace(llordname, "'", "'") llordname = re.sub("^De ", "de ", llordname) fullname = '%s %s' % (ltitle, llordname) llordofname = "" if lfn.group(2): llordofname = string.replace(lfn.group(2), ".", "") fullname = '%s of %s' % (fullname, llordofname) if fullname in self.aliases: return self.aliases[fullname] return self.GetLordID(ltitle, llordname, llordofname, "", stampurl, sdate, True)
def GrabWestminDivisionInterruptProced(qbp, rawtext):
    """Split a trailing 'sitting suspended' notice off a speech.

    Looks at the last paragraphs of qbp.stext: either an 'on resuming'
    paragraph preceded by a suspended/adjourned paragraph (two paragraphs),
    or a single 'sitting suspended ...' paragraph.

    Returns a new no-speaker qspeech holding those paragraphs (which are
    removed from qbp.stext), or None when qbp has fewer than three
    paragraphs or neither pattern is found.
    Raises ContextException when 'on resuming' is found without the
    suspended/adjourned paragraph before it.
    """
    if len(qbp.stext) < 3:
        return None

    lastpara = qbp.stext[-1]
    iskip = 0
    if re.search("italic.*?>on resuming&\S*</p>(?i)", lastpara):
        if not re.search(
                "italic.*?>(?:sitting )?(?:suspended|adjourned)(?: for (?:a division|divisions) in the house)?[\.\s]*(?i)",
                qbp.stext[-2]):
            raise ContextException(
                'failed to detect sitting suspended interruption',
                fragment=qbp.stext[-2])
        iskip = -2
    elif re.search("italic.*?>sitting suspended(?: for| until| till|\.)(?i)",
                   lastpara):
        iskip = -1

    if not iskip:
        return None

    # copy the lines into a non-speaking paragraph.
    dumtext = re.sub(
        '<p>(?:<stamp aname="[^"]*?"/>)?<i>sitting suspended.*(?si)', '',
        rawtext)
    s = copy.copy(qbp.sstampurl)
    # The first object is built and then immediately replaced.  The original
    # author's note guesses this has "something to do with the timestamps",
    # so both constructor calls are kept, in the same order.
    qbdp = qspeech('nospeaker="true"', dumtext, s)
    qbdp = qspeech('nospeaker="true"', "", s)
    qbdp.typ = 'speech'
    qbdp.stext = qbp.stext[iskip:]

    # trim the moved paragraphs off the given speech
    qbp.stext = qbp.stext[:iskip]
    return qbdp
def GetLordIDfname(self, name, loffice, sdate, stampurl=None):
    """Resolve a lord given as a full display name.

    A leading 'The ' is removed and ' Of ' lower-cased.  Names that are
    keys of self.aliases, and the literal name 'Queen', return a fixed id.
    Anything else is split with honcompl into title, name and place and
    passed to self.GetLordID.

    Raises ContextException when honcompl does not match the name.
    """
    name = re.sub("^The ", "", name).replace(' Of ', ' of ')

    if name in self.aliases:
        return self.aliases[name]
    if name == "Queen":
        return "uk.org.publicwhip/person/13935"

    hom = honcompl.match(name)
    if not hom:
        raise ContextException("lord name format failure on '%s'" % name,
                               stamp=stampurl, fragment=name)

    # now we have a speaker, try and break it up
    # group 1 is a title used on its own; otherwise groups 2 and 3 carry
    # the title and the name
    ltit = hom.group(1)
    lname = ""
    if not ltit:
        ltit, lname = hom.group(2), hom.group(3)

    # NOTE(review): pattern and replacement are identical as written, so this
    # substitution (and the one on the place below) changes nothing -- confirm.
    ltit = re.sub(" ", " ", ltit)

    lplace = ""
    placepart = hom.group(4)
    if placepart:
        lplace = re.sub(" ", " ", placepart)
        lplace = rehonorifics.sub("", lplace)

    lname = re.sub("^De ", "de ", lname)
    lname = rehonorifics.sub("", lname)

    return self.GetLordID(ltit, lname, lplace, loffice, stampurl, sdate,
                          False)
def StripDebateHeading(hmatch, ih, headspeak, bopt=False): # print "StripDebateHeading", hmatch reheadmatch = '(?:<stamp aname="[^"]*"/>)*\s*' + hmatch if (not re.match(reheadmatch, headspeak[ih][0])) or headspeak[ih][2]: if bopt: return ih print "headspeak", headspeak[ih][:2] if headspeak[ih][2]: raise ContextException( 'non-conforming section after "%s" heading. FOR EXAMPLE: "in the chair" missing <h4><center> ' % hmatch, fragment=headspeak[ih][0]) print reheadmatch print headspeak[ih][2] raise ContextException('non-conforming "%s" heading ' % hmatch, fragment=headspeak[ih][0]) return ih + 1
def FixHTMLEntities(stex, signore='', stampurl=None): res = string.join(FixHTMLEntitiesL(stex, signore, stampurl), '') try: res = res.decode('utf-8') return res.encode("latin-1") except Exception, e: print "Encoding problem with:", res raise ContextException(str(e), stamp=stampurl, fragment=res)
def StripDebateHeading(hmatch, ih, headspeak, bopt=False): reheadmatch = '(?:<stamp aname="[^"]*"/>)*' + hmatch if (not re.match(reheadmatch, headspeak[ih][0])) or headspeak[ih][2]: if bopt: return ih print "\n", headspeak[ih] raise ContextException('non-conforming "%s" heading ' % hmatch) return ih + 1
def RunVotesFilters(fout, text, sdate, sdatever):
    """Parse vote text and write the result to fout as ISO-8859-1 XML.

    sdatever is accepted but not used here.
    Raises ContextException, quoting the result and the first 128
    characters of the parser's first return value, when parsing fails.
    """
    (s, env, result) = parsevotetext(text, sdate)

    if not result.success:
        raise ContextException("Failed to parse vote\n%s\n%s" %
                               (result, s[:128]))

    xmldoc = result.delta.apply(None)
    xmldoc.writexml(fout, encoding="ISO-8859-1")
def MpTellerList(fsm, vote, stampurl, sdate): res = [] for fss in fsm: if fss == '</b>': continue # The end </b> on Tellers for the (Ayes|Noes): if fss == '<b> and</b>': continue # The 'and' now gets a paragraph of its own while fss: # split by lines, but linefeed sometimes missing gftell = re.match( '\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$', fss) if not gftell: raise ContextException("no match on teller line", stamp=stampurl, fragment=fss) fssf = gftell.group(1) fssfcons = gftell.group(2) fss = gftell.group(3) if len(res) >= 2: print fsm raise ContextException(' too many tellers ', stamp=stampurl, fragment=fss) # It always is if fssf == 'Mr. Michael Foster': fssfcons = 'Worcester' (mpid, remadename, remadecons) = memberList.matchfullnamecons( fssf.strip(), fssfcons, sdate) #print fssf, " ++> ", remadename.encode("latin-1") if not mpid: raise ContextException("teller name bad match", stamp=stampurl, fragment=fssf) res.append( '\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' % (mpid, vote, FixHTMLEntities(fssf))) return res
def GrabLordDivisionProced(qbp, qbd): if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1: print qbp.stext raise ContextException("previous to division not speech", stamp=qbp.sstampurl) hdg = relorddiv.match(qbp.stext[-1]) if not hdg: print qbp.stext[-1] raise ContextException("no lordships divided before division", stamp=qbp.sstampurl) # if previous thing is already a no-speaker, we don't need to break it out # (the coding on the question put is complex and multilined) if re.search('nospeaker="true"', qbp.speaker): qbp.stext = SubsPWtextset(qbp.stext) return None # look back at previous paragraphs and skim off a part of what's there # to make a non-spoken bit reporting on the division. iskim = 1 if not resaidamend.match(qbp.stext[-2]): print qbp.stext[-2] raise ContextException("no on said amendment", stamp=qbp.sstampurl, fragment=qbp.stext[-2]) iskim = 2 # copy the two lines into a non-speaking paragraph. qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl) qbdp.typ = 'speech' qbdp.stext = SubsPWtextset(qbp.stext[-iskim:]) # trim back the given one by two lines qbp.stext = qbp.stext[:-iskim] return qbdp
def LordsDivisionParsingPart(divno, unspoketxt, stampurl, sdate): # find the ending of the division and split it off. gquesacc = re.search(regenddiv, unspoketxt) if gquesacc: divtext = unspoketxt[:gquesacc.start(1)] unspoketxt = unspoketxt[gquesacc.start(1):] unspoketxt = re.sub(':ENDDIVISION:', '', unspoketxt) elif sdate > '2008-12-01': # Sigh XXX m = re.match('.*, [A-Z]\.</p>(?s)', unspoketxt) if not m: m = re.match('.*<br>(?s)', unspoketxt) divtext = m.group() unspoketxt = unspoketxt[m.end():] else: divtext = unspoketxt print "division missing %s" % regenddiv print unspoketxt print "is there a linefeed before the </center> on the CONTENTS?" raise ContextException("Division missing resolved in the", stamp=stampurl, fragment="Division") # newly added unspoketxt = '' divtext = re.sub(' style="margin-bottom:[^"]*"', '', divtext) # Add a division object (will contain votes and motion text) spattr = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno) qbd = qspeech(spattr, divtext, stampurl) qbd.typ = 'division' # this type field seems easiest way if not stampurl.timestamp: raise ContextException("Division missing any timestamps; need to put one in to make it consistent. like <h5>2.44 pm</h5>", stamp=stampurl, fragment="Division") # filtering divisions here because we may need more sophisticated detection # of end of division than the "Question accordingly" marker. qbd.stext = LordsFilterDivision(qbd.text, stampurl, sdate) return (unspoketxt, qbd)
def SplitParaIndents(text, stampurl): dell = SplitParaSpace(text, stampurl) #print "dell", dell res = [] resdent = [] bIndent = 0 for i in range(len(dell)): if (i % 2) == 0: for sp in dell[i]: if re.match('(?:<ul><ul>)?<ul>(?i)', sp): if bIndent == 1: print dell[i - 1:i + 1] raise ContextException(' already indented ', stamp=stampurl, fragment=sp) bIndent = 1 elif re.match('(?:</ul></ul>)?</ul>(?i)', sp): # no error #if not bIndent: # raise Exception, ' already not-indentented ' bIndent = 0 elif re.match('<p style="margin-left: ?[23]0px;">', sp): bIndent = 2 elif bIndent == 2 and re.match('</p>', sp): bIndent = 0 continue # we have the actual text between the spaces # we might have full italics indent style # (we're ignoring fonts for now) # separate out italics type paragraphs tex = dell[i] cindent = bIndent > 0 and 1 or 0 qitbod = re.match('<i>([\s\S]*?)</i>[.:]?$', tex) if qitbod: tex = qitbod.group(1) cindent = cindent + 2 res.append(tex) resdent.append(cindent) #if bIndent: # print text # raise ' still indented after last space ' return (res, resdent)
def ParseRow(srow, hdcode, stampur): # build up the list of entries for this row Lscols = ['\t\t<tr> '] for spcol in recolsplit.split(srow): col = recolmatch.match(spcol) if col: colspan = '' rowspan = '' if col.group(2): colspan = ' colspan="%s"' % col.group(2) if col.group(5): colspan = ' colspan="%s"' % col.group(5) if col.group(3): rowspan = ' rowspan="%s"' % col.group(3) talign = '' if col.group(1): talign = ' align="%s"' % col.group(1) if col.group(4): talign = ' align="%s"' % col.group(4) Lscols.append('<%s%s%s%s>' % (hdcode, colspan, rowspan, talign)) coltext = re.sub('\n', ' ', col.group(6)) coltext = re.sub( '</?font[^>]*>|</?p[^>]*>|</?center>|</?B>|</?ul>(?i)', '', coltext) coltext = re.sub('^(?:<br>|\s)(?i)', '', coltext) coltext = re.sub('(?:<br>|\s)$(?i)', '', coltext) content = FixHTMLEntitiesL(coltext, '', stampurl=stampur) Lscols.extend(content) Lscols.append('</%s> ' % hdcode) # check that the outside text contains nothing but bogus close column tags elif not re.match('(?:</t[dh]>|</font>|\s)*$(?i)', spcol): print "spcol:", spcol print "srow:", srow print "srowsplit:", recolsplit.split(srow) raise ContextException("non column text", stamp=stampur, fragment=srow) Lscols.append('</tr>') return string.join(Lscols, '')
def ExtractQnum(tex, stampurl):
    """Split a trailing question number off a paragraph.

    Returns (text, qnum).  When tex does not end in a number followed by
    ']' the result is (tex, '0'); the 0 qnums are detected elsewhere
    (should have used "0error") in MeasureBlockSimilarity for gidmatching.

    A further bracketed number inside the remaining text is tolerated only
    when the text carries an <ok-extra-qnum> marker (the first one is
    removed) or the number lies between 1980 and 2020 inclusive.
    Otherwise ContextException is raised.
    """
    qn = re.match('(.*?)\s*\[?((?:HL)?\d+R?)\]$', tex)
    if not qn:
        # default when no qnum is found
        return (tex, '0')

    text = qn.group(1)
    trailingqnum = qn.group(2)

    isqn = re.search('\[((?:HL)?(\d+)R?)\]', text)
    if not isqn:
        return (text, trailingqnum)

    nqn = int(isqn.group(2))
    if "<ok-extra-qnum>" in text:
        text = text.replace("<ok-extra-qnum>", "", 1)
    elif not (1980 <= nqn <= 2020):
        print(tex)
        print('A colnum may be removing a necessary <p> tag before the (2)')
        raise ContextException('qnum in middle of index block',
                               stamp=stampurl, fragment=isqn.group(1))

    return (text, trailingqnum)
def FilterLordsColtime(fout, text, sdate):
    """Copy text to fout, replacing column, time and anchor markers by stamps.

    text is split with recomb and each fragment is classified in turn:
      * column markers (recolumnumvals) become <stamp coldate=... colnum=.../>
        when the number moves forward by 1 to 5 (or is the first one seen);
        repeats, decrements by one and any other jump are dropped silently
      * time markers (retimevals) become <stamp time=.../>
      * a spelled-out time in a heading (regtime3vals) is written back
        unchanged, followed by a <stamp time=.../>
      * anchors (reanamevals) become <stamp aname=.../>
      * everything else is written through unchanged

    Raises ContextException when a column marker carries a different date
    from sdate, or when an unclassified fragment still matches recomb or
    remarginal.
    """
    colnum = -1
    time = ''

    stampurl = StampUrl(sdate)
    previoustime = []
    for fss in recomb.split(text):
        # column number type

        # we need some very elaborate checking to sort out the sections, by
        # titles that are sometimes on the wrong side of the first column,
        # and by colnums that miss the GC code in that section.
        # column numbers are also missed during divisions, and this exception
        # should be detected and noted.

        # That implies that this is the filter which detects the boundaries
        # between the standard four sections.
        columng = recolumnumvals.match(fss)
        if columng:
            # check date
            ldate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" %
                                       (sdate, fss),
                                       stamp=stampurl, fragment=fss)

            # check number
            # ltype = columng.group(2)
            lcolnum = string.atoi(columng.group(3))
            if lcolnum == colnum - 1:
                pass  # spurious decrementing of column number stamps
            elif lcolnum == colnum:
                pass  # spurious repeat of column number stamps
            # good (we get skipped columns in divisions)
            elif (colnum == -1) or (colnum + 1 <= lcolnum <= colnum + 5):  # was 2 but this caused us to miss ones
                colnum = lcolnum
                fout.write('<stamp coldate="%s" colnum="%s%s"/>' %
                           (sdate, colnum, ""))

            # column numbers do get skipped during division listings
            else:
                pass
                #print "Colnum not incrementing %d -- %d -- %s" % (colnum, lcolnum, fss)
                #raise Exception, "Colnum not incrementing %d -- %d -- %s" % (colnum, lcolnum, fss)

            #print (ldate, colnum, lindexstyle)
            continue

        timeg = retimevals.match(fss)
        if timeg:
            time = timeg.group(1)
            # a captured closing tag means there was no time text to process
            if not re.match('(?:</h5>|</st>)(?i)', time):
                time = TimeProcessing(time, previoustime, False, stampurl)
                fout.write('<stamp time="%s"/>' % time)
                if time:
                    previoustime.append(time)
            continue

        # special lift a time out of the heading
        regtime3 = regtime3vals.match(fss)
        if regtime3:
            fout.write(fss)  # put this heading back into the flow of text
            # this form is only expected before any other time has been seen
            assert not previoustime
            lntimematch = re.match("(half[\- ]past )?(\w+)(-thirty)?$",
                                   regtime3.group(1))
            lnhour = lntimematch and lntimematch.group(2)
            # strange way to do it, but I'm keeping tab on examples, and the transition between am and pm
            if lnhour == "two":
                lntimep = "2:%s pm"
            elif lnhour == "three":
                lntimep = "3:%s pm"
            elif lnhour == "six":
                lntimep = "6:%s pm"
            elif lnhour == "nine":
                lntimep = "9:%s am"
            elif lnhour == "eleven":
                lntimep = "11:%s am"
            elif lnhour == "ten":
                lntimep = "10:%s am"
            else:
                print "-------------'%s'" % regtime3.group(1)
                assert False
            # "half past X" and "X-thirty" must not both be present
            assert not lntimematch.group(1) or not lntimematch.group(3)
            # either half-hour form gives minutes "30", otherwise "00"
            ntime = lntimep % (
                (lntimematch.group(1) or lntimematch.group(3)) and "30" or "00")
            time = TimeProcessing(ntime, previoustime, False, stampurl)
            fout.write('<stamp time="%s"/>' % time)
            continue

        # anchor names from HTML <a name="xxx">
        anameg = reanamevals.match(fss)
        if anameg:
            aname = anameg.group(1)
            fout.write('<stamp aname="%s"/>' % aname)
            stampurl.aname = aname
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            print "$$$", fss, "$$-$"
            raise ContextException(' regexpvals not general enough ',
                                   stamp=stampurl, fragment=fss)  # a programming error between splitting and matching
        if remarginal.search(fss):
            print remarginal.search(fss).group(0)
            # printed only as a debugging aid for one known marginal layout
            lregcolumnum6 = '<p>\s*</ul>\s*<a name="column_\d+">(?:</a>)?\s*<b>[^:<]*:\s*column\s*\d+\s*</b></p>\s*<ul><font size=3>(?i)'
            print re.findall(lregcolumnum6, fss)
            #print fss
            raise ContextException(' marginal coltime detection case ',
                                   stamp=stampurl, fragment=fss)
        fout.write(fss)
def parse_day(self, input):
    """Parse one day's JSON transcript into headings and speeches.

    input is a JSON string: either a list of component dicts, or an object
    holding that list under AllHansardComponentsList/HansardComponent.
    Each component is dispatched on its 'ComponentType'.  Text is
    accumulated in self.text and flushed through self.display_speech()
    whenever a new heading, speaker or question starts, and once more at
    the end.

    Raises Exception on an unknown ComponentHeaderId and ContextException
    on an unhandled ComponentType.
    """
    self.heading = {}
    self.pre_heading = {}
    self.speaker = {}
    self.text = ''
    timestamp = ''
    j = json.loads(input)
    if 'AllHansardComponentsList' in j:
        j = j['AllHansardComponentsList']['HansardComponent']
    for line in j:
        # NOTE(review): as written this replaces '&' with '&' and so changes
        # nothing -- confirm the intended replacement text.
        text = (line['ComponentText'] or '').replace('&', '&')
        if not text:
            print "WARNING: Empty line: %s" % line
        elif line['ComponentType'] == 'Document Title':
            # title must be "Plenary, DD/MM/YYYY" for this object's date
            assert re.match(
                '(Plenary|PLE), %s/%s/%s$' %
                (self.date[8:10], self.date[5:7], self.date[0:4]), text)
        elif line['ComponentType'] == 'Time':
            timestamp = self.time_period(text)
        elif line['ComponentType'] == 'Header':
            if line['ComponentHeaderId'] in (0, 1, '0', '1'):
                typ = 'major'
            elif line['ComponentHeaderId'] in (2, '2'):
                typ = 'minor'
            else:
                raise Exception("Unknown ComponentHeaderId %s" %
                                line['ComponentHeaderId'])
            if self.heading and self.heading['type'] == typ:
                # consecutive headings of the same type are joined; the
                # first part is remembered in pre_heading
                self.pre_heading = {
                    'level': line['ComponentHeaderId'],
                    'text': self.heading['text']
                }
                self.heading['text'] += ' — %s' % text
            else:
                self.display_speech()
                self.speaker = {'ts': timestamp}
                if self.pre_heading and self.pre_heading['level'] == line[
                        'ComponentHeaderId']:
                    # same level as the remembered prefix: reuse the prefix
                    text = '%s — %s' % (self.pre_heading['text'], text)
                elif self.pre_heading and self.pre_heading['level'] > line[
                        'ComponentHeaderId']:
                    self.pre_heading = {}
                self.heading = {'text': text, 'ts': timestamp, 'type': typ}
        elif re.match(
                'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$',
                line['ComponentType']):
            # RelatedItemId here is the NI speaker ID. We could use that!
            # But for now, carry on going by name as all that code exists.
            self.display_speech()
            speaker = text.replace(':', '')
            id, stri = memberList.match(speaker, self.date)
            self.speaker = {'id': stri, 'ts': timestamp}
        elif line['ComponentType'] == 'Speaker (Special)' or line[
                'ComponentType'] == 'Speaker (GuestSpeaker)':
            # these speakers are recorded by name only, without a lookup
            self.display_speech()
            speaker = text.replace(':', '')
            self.speaker = {'name': speaker, 'ts': timestamp}
        elif line['ComponentType'] == 'Question':
            self.display_speech()
            # NOTE(review): m is used without a None check; text that does
            # not contain " asked" would raise AttributeError here.
            m = re.match('(T?[0-9]+\. )?(.*?) asked', text)
            id, stri = memberList.match(m.group(2), self.date)
            self.speaker = {'id': stri, 'ts': timestamp}
            self.text += "<p>%s</p>\n" % text
        elif line['ComponentType'] == 'Quote':
            self.text += '<p class="indent">%s</p>\n' % text
        elif line['ComponentType'] in ('Plenary Item Text', 'Procedure Line'):
            match = re.match(
                'The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)', text)
            if match:
                timestamp = self.time_period(text)
                self.speaker['ts'] = timestamp
            self.text += '<p class="italic">%s</p>\n' % text
        elif line['ComponentType'] == 'Bill Text':
            self.text += text.replace(
                '<p>', '<p class="indent">')  # Already is HTML
        elif line['ComponentType'] in ('Division', 'Spoken Text'):
            # a double line break becomes a paragraph boundary
            text = re.sub('\s*<BR />\s*<BR />\s*(?i)', '</p>\n<p>', text)
            text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
            self.text += '<p>%s</p>\n' % text
        else:
            raise ContextException("Uncaught Component Type! %s" %
                                   line['ComponentType'])
    # flush whatever is still pending
    self.display_speech()
def StripDebateHeadings(headspeak, sdate):
    """Skip the boilerplate headings at the top of a Commons debate file.

    headspeak -- sequence of heading entries; this function reads element
                 [0] (heading text), [1] (passed to UpdateStampUrl) and [2]
                 (truthy when the heading has speeches under it)
    sdate     -- date string; compared with the date heading and used to
                 special-case a few dates

    Returns (ih, stampurl): the index of the first heading that was not
    skipped, and a StampUrl whose timestamp is taken from the wording of
    the "house met at" heading and which is updated from the skipped
    headings.
    Raises Exception / ContextException when a mandatory heading is missing
    or malformed, when the start time wording is not one of the known
    forms, or when no stamp/page url is found.
    """
    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading(
        'Initial', ih, headspeak
    )  # the 'Initial' is inserted by the splitheadingsspeakers function

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?: )+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        # the following title-page lines are each optional (last argument True)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
            True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih,
            headspeak, True)
        ih = StripDebateHeading('\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading('SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading('VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    #House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    if not re.match('the house met at .*(?i)', headspeak[ih][0]):
        # NOTE(review): pattern and replacement are identical as written, so
        # this substitution changes nothing -- confirm the intended pattern.
        givendate = re.sub(' ', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$(?i)', givendate)
        if gd:
            givendate = gd.group(1)
        if ((sdate != mx.DateTime.DateTimeFrom(givendate).date)) or headspeak[ih][2]:
            raise Exception, 'date heading %s mismatches with date %s' % (repr(
                headspeak[ih]), sdate)
        ih = ih + 1

    # the "house met at" heading is not looked for on 2001-06-13
    gstarttime = None
    if sdate != "2001-06-13":
        #The House met at half-past Ten o'clock
        gstarttime = re.match(
            '(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$(?i)',
            headspeak[ih][0])
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:
        #PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading('pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading('\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        time = gstarttime.group(1)
        time = re.sub('</?i>', ' ', time)
        time = re.sub('\s+', ' ', time)
        # fixed table of the start-time wordings seen so far
        if re.match("half-past Nine(?i)", time):
            newtime = '09:30:00'
        elif re.match("a quarter to Ten o(?i)", time):
            newtime = '09:45:00'
        elif re.match("Ten o'clock(?i)", time):
            newtime = '10:00:00'
        elif re.match("half-past Ten(?i)", time):
            newtime = '10:30:00'
        elif re.match("Eleven o'clock(?i)", time):
            newtime = '11:00:00'
        elif re.match("twenty-five minutes past\s*Eleven(?i)", time):
            newtime = '11:25:00'
        elif re.match("twenty-six minutes past\s*Eleven(?i)", time):
            newtime = '11:26:00'
        elif re.match("twenty-nine minutes past\s*Eleven(?i)", time):
            newtime = '11:29:00'
        elif re.match("half-past Eleven(?i)", time):
            newtime = '11:30:00'
        elif re.match("Twelve noon(?i)", time):
            newtime = '12:00:00'
        elif re.match("half-past One(?i)", time):
            newtime = '13:30:00'
        elif re.match("half-past Two(?i)", time):
            newtime = '14:30:00'
        elif re.match("twenty minutes to Three(?i)", time):
            newtime = '14:40:00'
        elif re.match("10 minutes past Three(?i)", time):
            newtime = '15:10:00'
        elif re.match("Six o'clock(?i)", time):
            newtime = '18:00:00'
        else:
            raise ContextException, "Start time not known: " + time
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception, ' missing stamp url at beginning of file '
    return (ih, stampurl)
def SplitLordsText(text, sdate):
    """Split a Lords day file into its four sections.

    Section boundaries are located by the first anchor name of each type.
    Returns a four-element list [main debate, grand committee, written
    statements, written answers]; a missing section is ''.  Every section
    after the first is prefixed with the last <page...> tag of the section
    before it, and the main debate is reduced to '' when it consists only
    of <page...> tags.

    Raises AssertionError / ContextException when sections are out of
    order, carry the wrong column-number prefix, or lack their expected
    title; and ContextException when the main debate has no adjournment
    line (except on the listed dates).
    """
    res = ['', '', '', '']

    # Use a name tags
    wagc = re.search('(?:<br> <br>\s*)?<a name\s*=\s*"(?:gc|column_(?:GC|CWH)\d+|[0-9\-]+_cmtee0)">(?:</a>)?(?i)', text)
    wams = re.search('(?:<br> <br>\s*)?<a name="(?:wms|column_WS\d+)">(?:</a>)?(?i)', text)
    wama = re.search('(?:<br> <br>\s*)?<a name="(?:column_WA\d+|[\dw]*_writ0)">(?:</a>)?(?i)', text)

    # the sections are always in the same order, but sometimes there's one missing.

    # set end of house of lords section and check order
    if wagc:
        holend = wagc.start(0)
        if wams:
            assert holend < wams.start(0)
        elif wama:
            assert holend < wama.start(0)
    elif wams:
        holend = wams.start(0)
        if wama:
            assert holend < wama.start(0)
    elif wama:
        holend = wama.start(0)
    else:
        holend = len(text)

    # the main debate is everything before the first other section
    res[0] = text[:holend]

    # set the grand committee end
    if wagc:
        if wams:
            gcend = wams.start(0)
        elif wama:
            gcend = wama.start(0)
        else:
            gcend = len(text)
        res[1] = text[holend:gcend]
    else:
        gcend = holend

    # set the ministerial statements end
    if wams:
        if wama:
            msend = wama.start(0)
        else:
            msend = len(text)
        res[2] = text[gcend:msend]
    else:
        msend = gcend

    # set the written answers end
    maend = len(text)  # not read again in this function
    if wama:
        res[3] = text[msend:]

    # lords splitting
    if IsNotQuiet():
        print "Lords splitting into parts of size: ", map(len, res)

    # check the wrong column numbering or wrong titles aren't found in the wrong place
    assert res[0]  # there always is a main debate
    chns = re.search('<a name="column_\D+\d+">', res[0])
    if chns:
        print chns.group(0)
        raise ContextException("wrong column numbering in main debate",
                               fragment=chns.group(0))

    # check that there is always an adjournment in the main debate, with some of the trash that gets put before it
    # this kind of overguessing is to get a feel for the variation that is encountered.
    if sdate not in ('2007-10-01', '2008-09-29', '2009-10-05', '2010-09-27', '2012-09-24', '2013-09-23') \
            and not re.search('(?:<ul><ul><ul>|<ul><ul><p>|\s*(?:<ul>|<p>)?|<p>\s*<ul><ul>(?:<ul>)?)\s*(?:Parliament was prorogued|House adjourned |For the continuation of today\'s proceedings)(?i)', res[0]):
        raise ContextException("house adjourned failure", stamp=None,
                               fragment=res[0][-100:])

    # NOTE(review): [-1] raises IndexError when the section has no <page...>
    # tag at all; the same applies to the lookups in res[1] and res[2] below.
    page = re.findall('<page[^>]*>', res[0])[-1]
    if (re.match('(<page[^>]*>\s*)+$', res[0])):
        res[0] = ''

    # check the title of the Grand Committee
    if res[1]:
        res[1] = page + res[1]
        page = re.findall('<page[^>]*>', res[1])[-1]
        assert not re.search('<a name="column_(?!(?:GC|CWH))\D+\d+">', res[1])
        if not re.search('<(?:h[23] align=)?"?center"?>(?:<a name="[^"]*">(?:</a>)?)?\s*(?:(?:Official Report of the )?(?:(?:the)?Northern Ireland Orders )?Grand Committee|Second Reading Committee)', res[1]):
            raise ContextException("grand committee title failure",
                                   stamp=None, fragment=res[1][:100])

    # check the title is in the Written Statements section
    if res[2]:
        res[2] = page + res[2]
        page = re.findall('<page[^>]*>', res[2])[-1]
        assert not re.search('<a name="column_(?!WS)\D+\d+">', res[2])
        assert re.search('center"?>(?:<a name="[^"]*">(?:</a>)?)?Written Statements?', res[2])

    # check the title and column numbering in the written answers
    if res[3]:
        res[3] = page + res[3]
        assert not re.search('<a name="column_(?!WA)\D+\d+">', res[3])
        if not re.search('<(?:h3 align=)?"?center"?>(?:<a name="[^"]*">(?:</a>)?)?\s*Written Answers?', res[3]):  # sometimes the s is missing
            raise ContextException("missing written answer title",
                                   fragment=res[3])

    return res
def CreateGIDs(gidpart, sdate, sdatever, flatb):
    """Assign GIDs to every block in flatb, in place.

    Each block's stamp (qb.sstampurl.stamp) is searched for a colnum="..."
    attribute and for optional parsemess-* corrections.  Blocks are numbered
    from 0 within each column; the column number must strictly increase from
    one column to the next so the GIDs stay unique.

    Sets on every qb:
        ignorenamemismatch: match object or None for the
            parsemess-ignorenamemismatch="yes" marker.
        shortGID: '<sdatever>.<colnum>.<n><ext>'.
        GID: 'uk.org.publicwhip/<gidpart>/<sdate><shortGID>'.
        stextptags: one ' pid="<shortGID>/<k>"' string per paragraph of
            qb.stext, with k counting from 1.
        gidredirect: an empty list.

    Args:
        gidpart: path component placed after 'uk.org.publicwhip/'.
        sdate: date string placed in front of the short GID.
        sdatever: first component of the short GID.
        flatb: iterable of blocks carrying .sstampurl.stamp and .stext.

    Raises:
        ContextException: a new column number is not greater than the
            previous one.
        AttributeError: the stamp has no colnum, or the column number is not
            digits optionally followed by W/S letters.
        ValueError: a parsemess-colnumoffset or parsemess-missgid value is
            not an integer.
    """
    pcolnum = "####"    # previous column label; the sentinel never equals a real one
    picolnum = -1       # previous column number
    ncid = -1           # index of the block within its column
    colnumoffset = 0    # persists across blocks once set by a stamp

    # the missing gid numbers come previous to the gid they would have gone,
    # to handle missing ones before the 0
    # 0-1, 0-2, 0, 1, 2, 3-0, 3-1, 3, ...
    ncmissedgidrun = 0
    ncmissedgid = 0

    for qb in flatb:
        stamp = qb.sstampurl.stamp

        # construct the gid
        realcolnum = re.search('colnum="([^"]*)"', stamp).group(1)

        # column number corrections appended on the end of the stamp: last one wins
        colnum_corrections = re.findall('parsemess-colnum="([^"]*)"', stamp)
        if colnum_corrections:
            realcolnum = colnum_corrections[-1]

        # this is to do a mass change of column number when they've got out of
        # sync with the GIDs (normally due to Hansard's cm->vo transition).
        # Every value is converted so a malformed one still raises.
        for offsettxt in re.findall('parsemess-colnumoffset="([^"]*)"', stamp):
            colnumoffset = int(offsettxt)

        realcolnumbits = re.match(r'(\d+)([WS]*)$', realcolnum)
        colnumN = int(realcolnumbits.group(1)) + colnumoffset
        colnum = str(colnumN) + realcolnumbits.group(2)
        qb.ignorenamemismatch = re.search('parsemess-ignorenamemismatch="yes"', stamp)

        # this numbers the speech numbers in the column numbers
        if colnum != pcolnum:
            # check that the column numbers are increasing
            # this is essential if the gids are to be unique.
            icolnum = int(re.match(r'(\d+)[WS]*$', colnum).group(1))
            if icolnum <= picolnum:
                print(stamp)
                raise ContextException("non-increasing column numbers %s %d" % (colnum, picolnum), stamp=qb.sstampurl, fragment=colnum)
            picolnum = icolnum
            pcolnum = colnum
            ncid = 0
            ncmissedgidrun = 0
            ncmissedgid = 0
        else:
            ncid += 1

        # this executes the missing ncid numbering command
        missgids = [int(missgid) for missgid in re.findall('parsemess-missgid="([^"]*)"', stamp)]
        bmissgid = ncid in missgids
        if bmissgid:
            ncmissedgidrun += 1
            missedgidext = "-%d" % ncmissedgidrun
        else:
            ncmissedgidrun = 0
            missedgidext = ""

        # this is our GID !!!!
        qb.shortGID = '%s.%s.%d%s' % (sdatever, colnum, ncid - ncmissedgid, missedgidext)
        qb.GID = 'uk.org.publicwhip/%s/%s%s' % (gidpart, sdate, qb.shortGID)

        # counted only after the GID is built, so the missed block shares the
        # number of the block that follows it
        if bmissgid:
            ncmissedgid += 1

        # build the parallel set of GIDs for the paragraphs (in preparation for an upgrade)
        qb.stextptags = [' pid="%s/%d"' % (qb.shortGID, pnum) for pnum in range(1, len(qb.stext) + 1)]

        # make a place to record the gidredirects which we obtain on the way through
        qb.gidredirect = []
def MatchPWmotionStuff(qb, ispeechstartp1):
    """Classify one paragraph as a piece of procedural (motion) text.

    The paragraph qb.stext[ispeechstartp1] is tested against an ordered list
    of patterns; the first that matches decides the result, so the order of
    the tests below is significant.

    Args:
        qb: block whose .stext list holds the paragraphs; .sstampurl is only
            read when an error is raised.
        ispeechstartp1: index of the paragraph to classify.

    Returns:
        A label string ("withdrawn", "notmoved", "suspended", "misc",
        "agreedto", "negatived", "disagreedto", "considered", "agreeto",
        "bill", "adjourned", "resumed", "message", "divided", "title"),
        or None when the paragraph is not recognised as motion text.

    Raises:
        ContextException: the paragraph looks like motion text ("withdrawn",
            "[<i>", "agreed to") but none of the precise patterns matched.
    """
    qpara = qb.stext[ispeechstartp1]

    # NOTE: inline (?i) flags are kept at the very start of each pattern;
    # newer versions of the re module reject them anywhere else.
    if re.match(r'(?i)<p>(?:\[|<i>)*(?:Amendments?|Motion),? ?.{0,60}?(?:by leave)?,? withdrawn\.?,?(?:\]|</i>)*</p>', qpara):
        return "withdrawn"

    # [<i>Amendments Nos. 131 and 132 not moved.</i>]</p>
    if re.match(r'(?i)<p[^>]*>(?:\[|<i>)+Amendments? .{0,80}?(not moved|had been withdrawn from the Marshalled List|had been retabled as(?:Nos?\.|[^<\.\]]){0,60})(?:\.|</i>|\])+</p>', qpara):
        return "notmoved"
    if re.match(r'<p>Motion not moved\.</p>', qpara):
        return "notmoved"

    if re.match(r'(?i)<p>\[(?:<i>)?The Sitting was suspended .{0,60}?(?:</i>)?\](?:</i>)?</p>', qpara):
        return "suspended"

    if re.match(r'<p>\[(?:<i>)?The House observed.{0,60}?(\]|\.|</i>)+</p>', qpara):
        return "misc"
    if re.match(r'<p>\[(?:<i>)?The page and line refer(?:ences are)? to .{0,160}?</p>', qpara):
        return "misc"

    # Needed to avoid lords on 2012-07-03 thinking this is someone withdrawing Amendment 63.
    if re.match('<p>Amendment 63 has been withdrawn, so I turn now to', qpara):
        return None
    # Needed to avoid lords on 2012-07-04 thinking this is someone withdrawing
    # an Amendment rather than discussing it.
    if re.match('<p>My Amendment 148G has been withdrawn from the Marshalled List,', qpara):
        return None

    # near misses of the patterns above are reported rather than guessed at
    if re.match('<p>.{0,10}?(?:Amendment.{0,50}?|by leave, )(?<!semi-)withdrawn', qpara):
        raise ContextException("Marginal withdrawn (fragment looks like it might be a withdrawn amendment, \nbut earlier regexp didn't pick it up)", stamp=qb.sstampurl, fragment=qpara)
    if re.match(r'<p>\s*\[<i>', qpara):
        raise ContextException("Marginal notmoved (fragment looks like it might be an amendment not moved, \nbut an earlier regexp didn't pick it up)", stamp=qb.sstampurl, fragment=qpara)

    # NOTE(review): the literal em dash in several patterns below is copied as
    # found; confirm the input really carries the raw character (rather than
    # an entity) and that the file declares a source encoding.
    if re.match(r'(?i)<p>(?:<i>)?(?:Moved.? accordingly,? and,? )?(?:[Oo]n [Qq]uestion,? )?(?:[Oo]riginal )?(?:[Mm]otion|[Aa]mendment|[Ss]chedule)s?(?: No\. \d+| [A-Z])?(?:, as amended)?,? agreed to(?:\.|—)+(?: Commons amendments?)?(?:</i>)?</p>', qpara):
        return "agreedto"

    clauseAgreedMatch = re.match(r'<p>(?:(?:Clause|Schedule)s? \d+[A-Z]*,?(?:, \d+[A-Z]*)?(?: (?:and|to) \d+[A-Z]*)?|Title|Motion)(?:, as amended,?)? ((?:dis)?agreed to|negatived)\.</p>', qpara)
    if clauseAgreedMatch:
        # anything other than a plain "agreed to" (including "disagreed to") is "negatived"
        return "agreedto" if clauseAgreedMatch.group(1) == "agreed to" else "negatived"

    clauseResolvedMatch = re.match(r'<p>Resolved in the (negative|affirmative),? and (?:Motion(?: \w+)?|amendments?|the manuscript amendment|Clause \d+|Amendment .{5,60}?)(?:, as amended,)? (?:dis)?agreed to accordingly(?:\.?</p>|;)', qpara)
    if clauseResolvedMatch:
        return "disagreedto" if clauseResolvedMatch.group(1) == "negative" else "agreedto"

    if re.match(r'<p>Remaining( clauses?| and| schedules?)+ agreed to\.</p>', qpara):
        return "agreedto"

    commonsAmendMatch = re.match(r'(?i)<p>(?:On Question, )?(?:manuscript )?(?:Commons )?Amendments? .{0,60}?(dis)?agreed to(?: accordingly)?\.</p>', qpara)
    if commonsAmendMatch:
        return "disagreedto" if commonsAmendMatch.group(1) else "agreedto"

    if re.match(r'<p>On Question, (?:Clause|Motion) .{0,16}?agreed to\.</p>', qpara):
        return "agreedto"
    if re.match(r'<p>Amendment disagreed to accordingly\.</p>', qpara):
        return "disagreedto"
    if re.match(r'<p>On Motion, Question agreed to\.</p>', qpara):
        return "agreedto"
    if re.match(r'<p>(The )?Schedule agreed to\.</p>', qpara):
        return "agreedto"
    if re.match(r'<p>Moved, That the .{0,120}? be (agreed to|approved)\.', qpara):
        return "considered"
    if re.match(r'<p>On Question, Whether .{0,60}? be agreed to\.', qpara):
        return "considered"
    if re.match(r'<p>The Commons amendments were considered and agreed to\.</p>', qpara):
        # NOTE(review): "agreeto" looks like a typo for "agreedto"; left
        # unchanged because the label is emitted into the output.
        return "agreeto"

    if re.match(r'<p>(?:The )?Bill (?:was )?returned (?:earlier )?(?:from|to) the Commons.{0,350}?\.</p>', qpara):
        return "bill"
    if re.match(r'<p[^>]*>The Commons (?:(?:do not )?insist on .{0,160}? but propose|have made the following consequential|(?:dis)?agree (?:to|with)) .{0,260}?(?:\.|—)*</p>', qpara):
        return "bill"
    if re.match(r'<p[^>]*>The Lords insist on .{0,160}? for the following reasons?(?:\.|—)+</p>', qpara):
        return "bill"

    if re.match(r'(?i)<p[^>]*>(?:<i>)?House adjourned (?:at|during) .{0,60}?(?:</i>)?</p>', qpara):
        return "adjourned"

    if re.match(r'<p>(?:House|Debate|Second [Rr]eading debate|(?:Further )?[Cc]onsideration of amendments on Report) resumed(?: on Clause \d+)?[\.:]', qpara):
        return "resumed"

    if re.match("<p>A message was brought from the Commons", qpara):
        return "message"

    if re.match(r'<p>\*?Their Lordships divided:', qpara):
        return "divided"

    # this is the tag that can be used to give better titles on the motion text.
    if re.match(r'<p>(?:Clause|Schedule) (?:\d+[A-Z]* )?\[(?:<i>)?.*?(?:</i>)?\]:</p>', qpara):
        return "considered"
    if re.match('<p>On Question, Whether ', qpara):
        return "considered"

    if re.match('<p>(?:Brought|Returned)(?: earlier)? from the Commons', qpara):
        return "misc"
    if re.match('<p>House (?:again )?in Committee', qpara):
        return "misc"
    if re.match(r'(?i)<p>\[\s*The (?:deputy )?chairman ', qpara):
        return "misc"
    if re.match('<p>(?:Bill )?[Rr]ead a third time', qpara):
        return "bill"
    if re.match(r'<p>An amendment \(privilege\) made\.', qpara):
        return "misc"
    if re.match(r'<p>Report received\.', qpara):
        return "misc"

    if re.match('<p>:TITLE3:', qpara):
        return "title"  # perhaps remove this keyword

    # A check for an unexpected "The noble Lord said:" used to live here; it
    # was disabled (XXX MPS 2007-07-05 "Don't care about this").

    if re.match(r'<p>.{0,55}agreed to(?:\.| accordingly)', qpara):
        print("**********Marginal agreedto %s" % (qpara,))
        raise ContextException("Marginal agreed to", stamp=qb.sstampurl, fragment=qpara)

    return None
def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Build the heading block for one piece of heading text.

    This is an attempt at major heading detection.  The main wrap code spots
    adjournment debates, and does its best with some procedural things, but
    it's pretty flawed.  Also, Oral questions heading is a super-major
    heading, so doesn't fit into the scheme.

    Args:
        headingtxt: raw heading text; <i>/<sup> tags are stripped first.
        stampurl: stamp object; .sdate and .aname are read here and the
            object is passed on to qspeech.
        state: dict carrying flags between calls.  The keys
            'just_had_points_of_order' and 'remaining_private_bills' are
            read, set and (for the former) deleted here.
        typ: when equal to 'westminhall' the heading is always minor.

    Returns:
        A qspeech with .typ set to 'minor-heading', 'inserted-heading',
        'oral-heading' or 'major-heading', and .stext holding the fixed
        heading text as a single element.

    Raises:
        ContextException: an imprecise "oral questions" or "in the chair"
            match, or a '<' or '>' left in the heading after entity fixing.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("(?i)</?(?:i|sup)>", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable
    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("(?i)Oral Answers to Questions", headingtxt):
        boralheading = True

    # Check if there are any other spellings of "Oral Answers to Questions" with a loose match.
    # The excluded dates have a genuine title with Oral in it.
    elif re.search('(?i)oral', headingtxt) and re.search('(?i)ques', headingtxt) \
            and not re.search(" Not ", headingtxt) \
            and not re.search("electoral", headingtxt) \
            and stampurl.sdate not in ("2002-06-11", "2012-02-09"):
        print(headingtxt)
        raise ContextException('Oral question match not precise enough', stamp=stampurl, fragment=headingtxt)

    # All upper case headings - UGH
    elif not re.search('[a-z]', headingtxt) \
            and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt) \
            and not ('remaining_private_bills' in state and re.search('(?i) Bill$', headingtxt)):
        bmajorheading = True

    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough', stamp=stampurl, fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # Wah
    if stampurl.sdate > '2006-05-07':
        if re.match("(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)", headingtxt):
            bmajorheading = True
        if re.match("(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings

    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    # Tested explicitly rather than through an assert, so the check is not
    # dropped when Python runs with -O.
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text', stamp=stampurl, fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb
def FilterReply(qs):
    """Turn the text of a reply into a list of marked-up paragraphs.

    qs.text is split with SplitParaIndents and the paragraphs are consumed
    in order.  Recognised lead-in phrases (holding answer, asked to reply,
    letter in library, letter from) become paragraphs of their own, tables
    go through ParseTable, and everything else is tokenized as an ordinary
    paragraph whose code is looked up in pcode by its indent flag.

    Args:
        qs: reply block; .text and .sstampurl are read here and qs is
            passed on to PhraseTokenize.

    Returns:
        The list of paragraphs produced by GetPara / ParseTable.

    Raises:
        Exception: the text contains no paragraphs at all.
        ContextException: a paragraph starts a table that it does not end.
    """
    # split into paragraphs.  The second result is a parallel array of bools
    (textp, textpindent) = SplitParaIndents(qs.text, qs.sstampurl)
    if not textp:
        raise Exception(' no paragraphs in result ')

    # the resulting list of paragraphs
    stext = []

    # index into the textp array as we consume it.
    i = 0

    # deal with holding answer phrase at front
    # <i>[holding answer 17 September 2003]:</i>
    qholdinganswer = resqbrack.match(textp[0])
    if qholdinganswer:
        pht = PhraseTokenize(qs, qholdinganswer.group(1))
        stext.append(pht.GetPara('holdinganswer'))
        textp[i] = textp[i][qholdinganswer.span(0)[1]:]
        if not textp[i]:
            i += 1

    # asked to reply.  The bounds test covers a reply that consisted of
    # nothing but the holding answer phrase, which would otherwise index
    # past the end of textp.
    if i < len(textp):
        qaskedtoreply = reaskedtoreply.match(textp[i])
        if qaskedtoreply:
            pht = PhraseTokenize(qs, qaskedtoreply.group(0))
            stext.append(pht.GetPara('askedtoreply'))
            textp[i] = textp[i][qaskedtoreply.span(0)[1]:]
            if not textp[i]:
                i += 1

    # go through the rest of the paragraphs
    while i < len(textp):
        para = textp[i]

        # deal with tables
        if re.match('(?i)<table', para):
            if not re.match(r'(?i)<table[^>]*>[\s\S]*?</table>$', para):
                print("textp[i]:  %s" % (para,))
                raise ContextException("table start with no end", stamp=qs.sstampurl, fragment=para)
            stext.extend(ParseTable(para, qs.sstampurl))
            i += 1
            continue

        qletterinlibrary = reletterinlibrary.match(para)
        if qletterinlibrary:
            pht = PhraseTokenize(qs, qletterinlibrary.group(0))
            stext.append(pht.GetPara('letterinlibrary'))
            # keep whatever follows the phrase for the next pass of the loop
            textp[i] = para[qletterinlibrary.span(0)[1]:]
            if not textp[i]:
                i += 1
            continue

        # <i>Letter from Ruth Kelly to Mr. Frank Field dated 2 December 2003:</i>
        # introducing a previous letter from a civil servant to an MP
        # this should tokenize the pieces more
        qlettfrom = relettfrom.match(para)
        if qlettfrom:
            pht = PhraseTokenize(qs, qlettfrom.group(1))
            stext.append(pht.GetPara('letterfrom'))
            i += 1
            continue

        # nothing special about this paragraph (except it may be indented)
        pht = PhraseTokenize(qs, para)
        stext.append(pht.GetPara(pcode[textpindent[i]], bKillqnum=True))
        i += 1

    return stext
def FilterLordsSpeech(qb):
    """Split one Lords speech block into spoken text and motion text.

    qb is first run through FilterDebateSpeech.  The function then works out
    the index of the first paragraph that is actually spoken (ispeechstartp1),
    scans forward from there for the first paragraph that MatchPWmotionStuff
    recognises, and tags every paragraph from that point on with a
    pwmotiontext="..." attribute.

    qb is modified in place (qb.stext, and qb.speaker for moved amendments).

    Returns:
        A list of speech blocks: qb itself and/or new qspeech objects holding
        the unspoken parts.

    Raises:
        ContextException: the opening of the speech is not one of the
            recognised forms, or "The noble Lord said" is not where expected.
        AssertionError: one of the structural assumptions below does not hold.
    """
    # pull in the normal filtering that gets done on debate speeches
    # does the paragraph indents and tables. Maybe should be inlined for lords
    FilterDebateSpeech(qb)

    # the colon attr is blank or has a : depending on what was there after the name that was matched
    ispeechstartp1 = 0  # plus 1

    # no colonattr or colon, must be making a speech
    recol = re.search('colon="(:?)"', qb.speaker)
    bSpeakerExists = not re.match('nospeaker="true"', qb.speaker)
    if bSpeakerExists and (not recol or recol.group(1)):
        # text of this kind at the beginning should not be spoken, assume there wasn't a colon
        if not re.search("<p>(?:moved|asked|rose to move,) (?i)", qb.stext[0]) or re.search("<p>moved formally(?i)", qb.stext[0]):
            ispeechstartp1 = 1  # 0th paragraph is speech text

    res = []  # output list
    preparagraphtype = ""

    # A named speaker whose first paragraph is not speech: decide what kind of
    # preamble it is.  The order of these tests matters.
    if bSpeakerExists and (ispeechstartp1 == 0):
        if re.match("<p>asked Her Majesty's Government|<p>asked the|<p>—Took the Oath", qb.stext[0]):
            preparagraphtype = "asked"
            # SearchForNobleLordSaid returns a paragraph index; it is compared
            # with len(qb.stext) and with fixed positions below.
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 == len(qb.stext):  # No Noble Lord said, the usual
                ispeechstartp1 = 1
            if ispeechstartp1 != 1:
                print "Noble Lord Said on ", ispeechstartp1, "paragraph"
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)", qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 not in [1, 2]:
                print "Noble Lord Said on ", ispeechstartp1, "paragraph"
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        # identify a writ of summons (single line)
        elif re.match("<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$", qb.stext[0]):
            assert len(qb.stext) == 1
            # kludgy; already have the <p>-tag embedded in the string
            qb.stext[0] = re.sub('^<p>', '<p pwmotiontext="summons">', qb.stext[0])
            res.append(qb)
            return res  # bail out

        # introduction of a new peer: emitted as a block with no speaker
        elif re.search("having been created.*?Was, in (his|her) robes, introduced", qb.stext[0]):
            assert len(qb.stext) == 1
            qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
            qbunspo.typ = 'speech'
            qbunspo.stext = qb.stext
            qbunspo.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">', qbunspo.stext[0])
            res.append(qbunspo)
            return res

        # NOTE(review): the first branch of this chain already matches this
        # prefix, so this branch looks unreachable - confirm.
        elif re.match("<p>—Took the Oath", qb.stext[0]):
            assert False

        # identify a moved amendment
        elif re.match("<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention", qb.stext[0]):
            # find where the speech begins, and strip out "The noble lord said:"
            preparagraphtype = "moved"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)

            # everything up to this point is non-speech
            assert ispeechstartp1 > 0
            qbprev = qspeech(qb.speaker, "", qb.sstampurl)
            qbprev.typ = 'speech'
            qbprev.stext = qb.stext[:ispeechstartp1]
            res.append(qbprev)
            if ispeechstartp1 == len(qb.stext):
                return res

            # upgrade the spoken part
            qb.speaker = string.replace(qb.speaker, 'colon=""', 'colon=":"')
            del qb.stext[:ispeechstartp1]
            assert qb.stext
            ispeechstartp1 = 1  # the spoken text must reach at least here (after the line, "The noble lord said:")

        # error, no moved amendment found
        else:
            print qb.stext
            print "no moved amendment; is a colon missing after the name?"
            raise ContextException("missing moved amendment", stamp=qb.sstampurl)

    # advance to place where non-speeches happen
    if ispeechstartp1 > len(qb.stext):
        print "ispeechstartp1 problem; speeches running through", ispeechstartp1, len(qb.stext)
        print qb.stext
        raise ContextException("end of speech boundary unclear running through; need to separate paragraphs?", stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    sAmendmentStatement = None
    while bSpeakerExists and (ispeechstartp1 < len(qb.stext)):
        sAmendmentStatement = MatchPWmotionStuff(qb, ispeechstartp1)
        if sAmendmentStatement:
            break
        ispeechstartp1 += 1

    # there are no further lines after the withdrawal
    if ispeechstartp1 == len(qb.stext):
        assert not sAmendmentStatement
        res.append(qb)
        return res

    # do the further lines after withdrawal
    assert (not bSpeakerExists) or sAmendmentStatement

    # splice off the unspoken text running off from the amendment statements
    if ispeechstartp1 != 0:
        qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
        qbunspo.typ = 'speech'
        qbunspo.stext = qb.stext[ispeechstartp1:]
        del qb.stext[ispeechstartp1:]
        res.append(qb)
        res.append(qbunspo)
    else:
        # the whole block is unspoken, so it is tagged in place
        res.append(qb)
        qbunspo = qb

    # check that once we begin pwmotion amendment statements, all statements are of this type
    for i in range(len(qbunspo.stext)):
        # entries that do not start with a <p tag are left untouched
        if not re.match('<p', qbunspo.stext[i]):
            continue

        sAmendmentStatement = MatchKnownAsPWmotionStuff(qbunspo, i)
        if not sAmendmentStatement:
            if IsNotQuiet():
                # the message is marked "(*)" when the block had no speaker
                print "UNRECOGNIZED-MOTION-TEXT%s: %s" % (bSpeakerExists and " " or "(*)", qbunspo.stext[i])
            sAmendmentStatement = "unrecognized"

        # add the label as an attribute on the opening <p ...> tag
        qbunspo.stext[i] = re.sub('^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % sAmendmentStatement, qbunspo.stext[i])

    return res
def LordsFilterDivision(text, stampurl, sdate):
    """Convert the text of a Lords division into a list of output lines.

    The intention is to splice out the known parts of the division.  The
    text is split on <br> and <p> tags; a CONTENTS line and then a
    NOT-CONTENTS (or NOT CONTENTS) line switch the voting state, bracketed
    footnotes are reported and skipped, and every other non-empty line is
    treated as one lord's name.

    Args:
        text: the division text.
        stampurl: stamp passed to the name matcher and to any error raised.
        sdate: date passed to lordsList.MatchRevName.

    Returns:
        A list of strings: a <divisioncount .../> element, then a
        <lordlist vote="content"> group and a <lordlist vote="not-content">
        group, each holding one <lord ...> line per voter.

    Raises:
        ContextException: an unknown heading, a name before any CONTENTS
            heading, or a line with no name left on it.
        AssertionError: the headings appear out of order or are repeated.
    """
    # the inline (?i) flag sits at the start of the pattern because newer
    # versions of the re module reject it anywhere else
    fs = re.split(r'(?i)\s*(?:<br>|</?p>)\s*', text)

    contentlords = []
    notcontentlords = []
    contstate = ''

    for fss in fs:
        if not fss:
            continue

        cfs = recontma.match(fss)
        if cfs:
            heading = cfs.group(1)
            if heading == "CONTENTS":
                assert contstate == ''
                contstate = 'content'
            elif heading in ('NOT-CONTENTS', 'NOT CONTENTS'):
                assert contstate == 'content'
                contstate = 'not-content'
            else:
                print("$$$%s$$$" % heading)
                raise ContextException("unrecognised content state", stamp=stampurl, fragment=fss)

        elif re.match(r"(?:\[\*|\*\[)[Ss]ee col\. \d+\]", fss):
            print("Disregarding cross-reference in Division %s" % (fss,))
        elif re.match(r"\[\*\s*The Tellers.*?[Tt]he Clerks.*?\]", fss):
            print("Disregarding clerk comment on numbers %s" % (fss,))
        elif re.match(r"\[\*\s*The name of a .*? removed from the voting lists\.\]", fss):
            print("Disregarding removed from list comment %s" % (fss,))

        else:
            if not contstate:
                raise ContextException("empty contstate", stamp=stampurl, fragment=fss)

            # split off teller case
            tels = ''
            lfss = fss
            teller = retellma.match(fss)
            if teller:
                lfss = teller.group(1)
                tels = ' teller="yes"'

            # strip out the office
            offm = reoffma.match(lfss)
            if offm:
                lfss = offm.group(1)
            if not lfss:
                raise ContextException("no name on line", stamp=stampurl, fragment=fss)

            lordid = lordsList.MatchRevName(lfss, sdate, stampurl)
            # the element text is the full original line, not the stripped name
            lordw = '\t<lord person_id="%s" vote="%s"%s>%s</lord>' % (lordid, contstate, tels, FixHTMLEntities(fss))

            if contstate == 'content':
                contentlords.append(lordw)
            else:
                notcontentlords.append(lordw)

    # now build up the return value
    stext = ['<divisioncount content="%d" not-content="%d"/>' % (len(contentlords), len(notcontentlords))]
    stext.append('<lordlist vote="content">')
    stext.extend(contentlords)
    stext.append('</lordlist>')
    stext.append('<lordlist vote="not-content">')
    stext.extend(notcontentlords)
    stext.append('</lordlist>')
    return stext
def MatchKnownAsPWmotionStuff(qb, ispeechstartp1):
    """Classify a paragraph already known to lie inside motion text.

    MatchPWmotionStuff is tried first.  When it returns nothing, a looser
    set of patterns is applied that is only safe once motion text has
    begun.  The tests run in order and the first match decides.

    Side effect: a paragraph in which a number is rammed up against a word
    such as "Clause" or "Page" is rewritten in qb.stext with a space
    inserted between them.

    Args:
        qb: block whose .stext list holds the paragraphs; .sstampurl is only
            read when an error is raised.
        ispeechstartp1: index of the paragraph to classify.

    Returns:
        The label from MatchPWmotionStuff, or one of "act", "lines", "date",
        "quot", or None when nothing matches.

    Raises:
        ContextException: the paragraph looks like the start of a speech
            (a name followed by "said", ":" or "moved"), or
            MatchPWmotionStuff raised one.
    """
    res = MatchPWmotionStuff(qb, ispeechstartp1)
    if res:
        return res

    qpara = qb.stext[ispeechstartp1]

    if re.match(r"<p>.{0,60}? Act[\.,]?</p>", qpara):
        return "act"

    if re.match(r"<p[^>]*>\([d\w]+\) ", qpara):
        return "lines"
    if re.match(r"<p[^>]*>\( \) ", qpara):
        return "lines"

    if re.match(r'<p><phrase class="date".*</phrase>\.</p>', qpara):
        return "date"

    if re.match(r"<p[^>]*>Sections? .{0,30}?</p>", qpara):
        return "lines"
    if re.match(r"<p[^>]*>(?:Schedule \S+?|The Schedule)(?:, paragraph.{0,60}?)?</p>", qpara):
        return "lines"
    if re.match(r"<p[^>]*>\d+[A-Z]?\.? ", qpara):
        return "lines"
    if re.match(r"<p[^>]*>Page \d+, line \d+, ", qpara):
        return "lines"

    # A paragraph opening with a quotation mark.  The literal as found was an
    # unterminated string; both the raw character and the &quot; entity are
    # accepted.  NOTE(review): confirm which form the input actually carries.
    if re.match('<p[^>]*>(?:"|&quot;)', qpara):
        return "quot"
    # starting with lower case letter, some kind of continuation
    if re.match("<p>[a-z]", qpara):
        return "quot"

    # NOTE(review): literal em dash copied as found; confirm the input carries
    # the raw character and that the file declares a source encoding.
    if re.match("<p[^>]*>—", qpara):
        return "lines"

    # insert an extra space because they tend to ram it together
    clpmatch = re.match(r"(<p[^>]*>\d+[A-Z]?)((?:Clause|Line|Page|Schedule|Because|After|Insert) .*$)", qpara)
    if clpmatch:
        qb.stext[ispeechstartp1] = "%s %s" % (clpmatch.group(1), clpmatch.group(2))
        return "lines"

    # anything that looks like a new speaker should not be inside motion text
    if re.match("<p>The noble .{0,30}?(?:Lord|Baroness|Earl|Viscount|Countess|Duke) said", qpara):
        print("***** %s" % (qpara,))
        raise ContextException("unexpected weak Noble Lord Said", stamp=qb.sstampurl, fragment=qpara)

    if re.match(r"<p>The .{5,40}? \([^)]+\):", qpara):
        raise ContextException("unexpected person with position Said", stamp=qb.sstampurl, fragment=qpara)

    if re.match(r"<p>(?:Lord|Baroness|Earl|Viscount|Countess|Duke) [\w\s].{5,40}?:", qpara):
        raise ContextException("unexpected person Said, (missing <b>?)", stamp=qb.sstampurl, fragment=qpara)

    if re.match(r"<p>(?:Lord|Baroness|Earl|Viscount|Countess|Duke) [\w\s].{5,40}? moved ", qpara):
        raise ContextException("unexpected person moved, (missing <b>?)", stamp=qb.sstampurl, fragment=qpara)

    return None