def TokenStandingOrder(mstandingo, phrtok):
    """Build the phrase token for a standing-order reference.

    mstandingo is a regex match: group(1) holds the standing-order code and
    the optional group(2) holds a title.  phrtok is not used by this function.

    Returns a ('phrase', attribute-string) pair.
    """
    code = mstandingo.group(1)
    title = mstandingo.group(2)

    if not title:
        # no title captured: the code is emitted exactly as matched
        return ('phrase', ' class="standing-order" code="%s"' % code)

    # drop any markup from the title before it goes into the attribute
    plaintitle = re.sub('<[^>]*>', '', title)
    attrs = ' class="standing-order" code="%s" title="%s"' % (
        FixHTMLEntities(code), FixHTMLEntities(plaintitle))
    return ('phrase', attrs)
def TokenizePhraseRecurse(self, qs, stex, itc):
    """Split stex on the pattern of tokenchain[itc], recursing up the chain.

    Text lying outside a match is handed on to entry itc + 1 of tokenchain.
    Once itc has run off the end of tokenchain, the text is appended to
    self.toklist as an untyped ('', '', text) entry.  Every match is
    converted by the chain entry's callback (index 3) and appended to
    self.toklist as (callback[0], callback[1], matched text).

    The stampurl handed to FixHTMLEntities is `qs and qs.sstampurl`, so a
    falsy qs is passed through unchanged.
    """
    # ran out of chain: what is left is plain text
    if itc == len(tokenchain):
        plain = FixHTMLEntities(stex, stampurl=(qs and qs.sstampurl))
        self.toklist.append(('', '', plain))
        return

    entry = tokenchain[itc]

    # keep consuming pieces of the text for this same token type
    while stex:
        mtoken = entry[1].search(stex)

        # the text in front of the match (everything when nothing matched)
        if mtoken:
            headtex = stex[:mtoken.start(0)]
        else:
            headtex = stex

        # marginal-pattern check; a match currently triggers no action
        if entry[2] and entry[2].search(headtex):
            pass

        # the unmatched lead text goes to the next token type in the chain
        if headtex:
            self.TokenizePhraseRecurse(qs, headtex, itc + 1)

        # nothing matched, so the whole text has been passed on
        if not mtoken:
            break

        # record the matched token itself
        tokpair = entry[3](mtoken, self)
        matched = FixHTMLEntities(mtoken.group(0), stampurl=(qs and qs.sstampurl))
        self.toklist.append((tokpair[0], tokpair[1], matched))

        # carry on with whatever follows the match
        stex = stex[mtoken.end(0):]
def LordsHeadingPart(headingtxt, stampurl, major):
    """Wrap a Lords heading in a speakerless qspeech and set its type.

    The heading is typed 'major-heading' only when major is truthy and
    stampurl.sdate is later than '2008-12-01'; otherwise 'minor-heading'.
    Returns the qspeech object.
    """
    fixedtxt = FixHTMLEntities(headingtxt)
    heading = qspeech('nospeaker="true"', fixedtxt, stampurl)

    headingtype = 'minor-heading'
    if major and stampurl.sdate > '2008-12-01':
        headingtype = 'major-heading'
    heading.typ = headingtype

    # the heading text is stored as a single unmarked paragraph
    heading.stext = [fixedtxt]
    return heading
def TokenDate(ldate, phrtok):
    """Build the phrase token for a date mentioned in the text.

    ldate is a regex match: group(0) is the date text and group(2) is truthy
    when that text already carries a year.  When it does not, the first four
    characters of phrtok.sdate are appended as the year.

    Side effect: phrtok.lastdate is set to the `.date` attribute of the
    parsed date, or to '' when parsing fails.

    Returns a ('phrase', attribute-string) pair.
    """
    sdate_year = phrtok.sdate[0:4]

    # NOTE(review): this literal reached review as an entity-decoded
    # character (making the replace a no-op); restored to the `&nbsp;`
    # entity form -- confirm against the repository copy.
    tdate = ldate.group(0).replace('&nbsp;', ' ')
    if not ldate.group(2):
        tdate += " %s" % sdate_year

    try:
        # NOTE(review): `.date` is presumably an ISO date string -- confirm
        # against the mx.DateTime documentation.
        phrtok.lastdate = mx.DateTime.DateTimeFrom(tdate).date
    except Exception:
        # unparseable date: record an empty code rather than failing, but
        # no longer swallow KeyboardInterrupt/SystemExit as a bare except did
        phrtok.lastdate = ''

    return ('phrase', ' class="date" code="%s"' % FixHTMLEntities(phrtok.lastdate))
def MpTellerList(fsm, vote, stampurl, sdate):
    """Turn the teller lines of a division into <mpname teller="yes"> entries.

    fsm is an iterable of text fragments.  Each fragment may hold several
    tellers joined by " and"; they are peeled off one at a time.  Each name
    (with its optional bracketed constituency) is resolved through
    memberList.matchfullnamecons for date sdate.

    Raises ContextException when a line does not parse, when a third teller
    turns up, or when a name cannot be matched.
    """
    tellerpattern = '\s*(?:and )?([ \w.\-\'&#;]*?)(?:\(([ \w.\-\'&#;]*)\))?(?: and(.*))?\s*\.?\s*$'

    tellers = []
    for fragment in fsm:
        # the closing </b> of the "Tellers for the ..." label, and the 'and'
        # that now gets a paragraph of its own, carry no names
        if fragment in ('</b>', '<b> and</b>'):
            continue

        # split by lines, but the linefeed is sometimes missing
        remaining = fragment
        while remaining:
            gftell = re.match(tellerpattern, remaining)
            if not gftell:
                raise ContextException("no match on teller line", stamp=stampurl, fragment=remaining)
            tellername = gftell.group(1)
            tellercons = gftell.group(2)
            remaining = gftell.group(3)

            if len(tellers) >= 2:
                print(fsm)
                raise ContextException(' too many tellers ', stamp=stampurl, fragment=remaining)

            # this particular teller is always given the Worcester constituency
            if tellername == 'Mr. Michael Foster':
                tellercons = 'Worcester'

            (mpid, remadename, remadecons) = memberList.matchfullnamecons(
                tellername.strip(), tellercons, sdate)
            if not mpid:
                raise ContextException("teller name bad match", stamp=stampurl, fragment=tellername)

            tellers.append(
                '\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' %
                (mpid, vote, FixHTMLEntities(tellername)))

    return tellers
def LordsFilterDivision(text, stampurl, sdate):
    """Convert the text of a Lords division into XML lines.

    text is split on <br>, <p> and </p> (case-insensitively).  A fragment
    matched by recontma switches the voting state (CONTENTS, then
    NOT-CONTENTS / NOT CONTENTS); three kinds of bracketed editorial note
    are reported and skipped; every other fragment is treated as a lord's
    name, resolved through lordsList.MatchRevName.

    Returns a list of strings: a <divisioncount/> line followed by the
    content and not-content <lordlist> blocks.

    Raises ContextException for an unrecognised state marker, a name that
    appears before any state marker, or a line with no name left on it.
    """
    # The case-insensitive flag has to lead the pattern: a trailing inline
    # flag is deprecated since Python 3.6 and an error from 3.11.
    fragments = re.split(r'(?i)\s*(?:<br>|</?p>)\s*', text)

    contentlords = []
    notcontentlords = []
    contstate = ''

    for fss in fragments:
        if not fss:
            continue

        cfs = recontma.match(fss)
        if cfs:
            marker = cfs.group(1)
            if marker == "CONTENTS":
                assert contstate == '', "CONTENTS marker seen twice"
                contstate = 'content'
            elif marker in ('NOT-CONTENTS', 'NOT CONTENTS'):
                assert contstate == 'content', "NOT-CONTENTS marker out of order"
                contstate = 'not-content'
            else:
                print("$$$%s$$$" % marker)
                raise ContextException("unrecognised content state", stamp=stampurl, fragment=fss)

        elif re.match(r"(?:\[\*|\*\[)[Ss]ee col\. \d+\]", fss):
            print("Disregarding cross-reference in Division %s" % (fss,))
        elif re.match(r"\[\*\s*The Tellers.*?[Tt]he Clerks.*?\]", fss):
            print("Disregarding clerk comment on numbers %s" % (fss,))
        elif re.match(r"\[\*\s*The name of a .*? removed from the voting lists\.\]", fss):
            print("Disregarding removed from list comment %s" % (fss,))

        else:
            if not contstate:
                raise ContextException("empty contstate", stamp=stampurl, fragment=fss)

            # split off the teller marking
            tels = ''
            lfss = fss
            teller = retellma.match(fss)
            if teller:
                lfss = teller.group(1)
                tels = ' teller="yes"'

            # strip out the office
            offm = reoffma.match(lfss)
            if offm:
                lfss = offm.group(1)
            if not lfss:
                raise ContextException("no name on line", stamp=stampurl, fragment=fss)

            lordid = lordsList.MatchRevName(lfss, sdate, stampurl)
            lordw = '\t<lord person_id="%s" vote="%s"%s>%s</lord>' % (
                lordid, contstate, tels, FixHTMLEntities(fss))

            if contstate == 'content':
                contentlords.append(lordw)
            else:
                notcontentlords.append(lordw)

    # now build up the return value
    stext = ['<divisioncount content="%d" not-content="%d"/>' %
             (len(contentlords), len(notcontentlords))]
    stext.append('<lordlist vote="content">')
    stext.extend(contentlords)
    stext.append('</lordlist>')
    stext.append('<lordlist vote="not-content">')
    stext.extend(notcontentlords)
    stext.append('</lordlist>')
    return stext
def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Build the heading qspeech for headingtxt and classify its type.

    This is an attempt at major heading detection.  The resulting qb.typ is
    one of 'minor-heading', 'inserted-heading', 'oral-heading' or
    'major-heading'; when typ == 'westminhall' it is always 'minor-heading'.

    state is a mapping carried between calls: the keys
    'just_had_points_of_order' and 'remaining_private_bills' are read,
    set and (for the former) deleted here.

    Raises ContextException when a heading only loosely resembles
    "Oral Answers to Questions" or "[... in the Chair]", or when the
    entity-fixed heading still contains '<' or '>'.
    """
    # All case-insensitive patterns carry (?i) at the front: a trailing
    # inline flag is deprecated since Python 3.6 and an error from 3.11.

    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub(r"(?i)</?(?:i|sup)>", "", headingtxt)

    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search(r'(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral questions are really a major heading
    elif re.match(r"(?i)Oral Answers to Questions", headingtxt):
        boralheading = True

    # any other spelling of "Oral Answers to Questions" is rejected, apart
    # from a few known genuine titles and dates
    elif (re.search(r'(?i)oral', headingtxt) and re.search(r'(?i)ques', headingtxt)
          and not re.search(" Not ", headingtxt)
          and not re.search("electoral", headingtxt)
          and stampurl.sdate not in ("2002-06-11", "2012-02-09")):
        print(headingtxt)
        raise ContextException('Oral question match not precise enough',
                               stamp=stampurl, fragment=headingtxt)

    # all upper case headings, excluding codes like AB/12/CD and, while in
    # the remaining-private-bills section, headings ending in " Bill"
    elif (not re.search(r'[a-z]', headingtxt)
          and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
          and not ('remaining_private_bills' in state
                   and re.search(r'(?i) Bill$', headingtxt))):
        bmajorheading = True

    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough',
                               stamp=stampurl, fragment=headingtxt)

    # other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # after 2006-05-07 some headings are recognised by their wording
    if stampurl.sdate > '2006-05-07':
        if re.match(r"(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)", headingtxt):
            bmajorheading = True
        if re.match(r"(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match(r"(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)

    # explicit check rather than assert, so it still runs under `python -O`
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text',
                               stamp=stampurl, fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb
def _WriteRegmemItem(fout, subcategory, content):
    """Write one <item> line, tagged with the subcategory when one is set."""
    if subcategory:
        fout.write('\t\t<item subcategory="%s">%s</item>\n' % (subcategory, content))
    else:
        fout.write('\t\t<item>%s</item>\n' % content)


def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert a register of members' interests page into XML written to fout.

    Registers dated 2010-09-01 or later are handed to RunRegmemFilters2010.
    Otherwise the <TR> rows of text are classified by their number of cells
    and the content of the leading cells: member name rows, category rows,
    and item rows.  They are written out as <regmem>/<category>/<item>
    elements inside a <publicwhip> wrapper.

    After writing, the set of matched member ids is compared with
    memberList.mpslistondate(sdate) and any missing/extra ids are printed.

    Raises ContextException when a name row cannot be split or matched,
    when a category number cannot be read, or for an unknown row shape.
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    # message for cron so I check I'm using this
    print("New register of members interests! Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    # NOTE(review): the entity literals below ('&#128;', '&pound;',
    # '&nbsp;', '&#173;') reached review as decoded characters; they are
    # restored here in entity form on a best-guess basis -- confirm the
    # exact spellings against the repository copy.
    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('&#128;', '&pound;', text)  # Always get some pound signs wrong

    rows = re.findall("<TR>(.*)</TR>", text)
    rows = [re.sub("&nbsp;", " ", row) for row in rows]
    rows = [re.sub("<B>|</B>|<BR>|`", "", row) for row in rows]
    rows = [re.sub('<span style="background-color: #FFFF00">|</span>', '', row) for row in rows]
    rows = [re.sub('<IMG SRC="3lev.gif">', "", row) for row in rows]
    rows = [re.sub("&#173;", "-", row) for row in rows]
    rows = [re.sub(r'\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', '', row) for row in rows]
    rows = [re.sub(r'\[<A NAME="n\d+">\d+\]', '', row) for row in rows]

    # Fix incorrect tabling of categories when highlighting is in play
    rows = [re.sub(r'<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>',
                   r'<TD>\1</TD><TD COLSPAN=3>\2</TD>', row) for row in rows]

    # split into cells within a row
    rows = [re.findall(r"<TD.*?>\s*(.*?)\s*</TD>", row) for row in rows]

    memberset = set()
    needmemberend = False
    category = None
    categoryname = None
    subcategory = None

    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        ncells = len(row)

        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass

        # NOTE(review): the space in this pattern may originally have been
        # an entity; kept as a plain space since &nbsp; has already been
        # replaced by spaces above -- confirm.
        elif ncells == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # single spacing-only cell
            pass

        elif ncells == 1:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search(r"^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print(row)
                raise ContextException("Failed to break up into first/last/cons: %s" % row[0])
            (lastname, firstname, constituency) = res.groups()
            constituency = constituency.replace(')', '').replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                    or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                check_date = '2009-06-08'
            else:
                check_date = sdate

            (personid, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname),
                constituency, check_date)
            if not personid:
                raise ContextException("Failed to match name %s %s (%s) date %s" % (
                    firstname, lastname, constituency, sdate))

            # close whatever was open for the previous member
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
                needmemberend = False

            fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' %
                        (personid, remadename, sdate)).encode("latin-1"))
            memberset.add(personid)
            needmemberend = True
            category = None
            categoryname = None
            subcategory = None

        elif ncells == 2 and row[0] == '' and re.match(r'Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')

        elif ncells == 2 and row[0] != '':
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            if category:
                fout.write('\t</category>\n')
            catmatch = re.match(r"\s*(\d\d?)\.$", row[0])
            if not catmatch:
                # previously surfaced as AttributeError on None
                print(row)
                raise ContextException("Failed to read category number from: %s" % row[0])
            category = catmatch.group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' % (category, categoryname))

        elif ncells == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to ... received from:</B></TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[1]))

        elif ncells == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>... (Registered 3 October 2002)</TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[2]))

        elif ncells == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[1] + ' ' + row[2]))

        elif ncells == 4 and row[0] == '' and (row[1] == '' or row[1] == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match(r"\(([ab])\)$", row[2])
            if not subcategorymatch:
                _WriteRegmemItem(fout, subcategory, FixHTMLEntities(row[2] + " " + row[3]))
            else:
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                           (subcategory, FixHTMLEntities(row[3])))

        else:
            print(row)
            raise ContextException("Unknown row type match, length %d" % (len(row)))

    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')
        needmemberend = False

    membersetexpect = set([m['person_id'] for m in memberList.mpslistondate(sdate)])

    # check for missing/extra entries
    missing = membersetexpect.difference(memberset)
    if missing:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if extra:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")
def MpList(fsm, vote, stampurl, sdate):
    """Turn the name lines of a Commons division into <mpname> entries.

    fsm is an iterable of text lines.  Lines matched by reconstnm (a lone
    constituency) are merged onto the previous line; lines holding several
    concatenated "Surname, Forename" entries are split apart; each name is
    then resolved through memberList.matchfullnamecons for date sdate.

    When a name is ambiguous (matchfullnamecons reports "MultipleMatch"
    with a tuple of candidate ids), the candidates are handed out in order,
    one per occurrence, and every candidate must be used up by the end.

    Returns a list of '\\t<mpname .../>' strings with the given vote.
    Raises ContextException on any line that cannot be parsed or matched.
    """
    # Merge lone listed constituencies onto end of previous line
    newfsm = []
    for fss in fsm:
        if not fss:
            continue
        if reconstnm.match(fss):
            if not newfsm:
                # previously an IndexError on newfsm[-1]
                raise ContextException("constituency line with no preceding name (division)",
                                       stamp=stampurl, fragment=fss)
            newfsm[-1] += " " + fss
        else:
            newfsm.append(fss)

    res = []
    multimatches = {}  # from tuple of candidate ids to the names assigned so far
    for fss in newfsm:
        # break up concatenated lines, e.g.
        #   Beresford, Sir PaulBlunt, Crispin
        while re.search(r'\S', fss):
            # there was an & in [A-Z] on the line below, but it wrongly
            # broke up names containing an entity followed by a bracket
            regsep = re.search(r'(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$', fss)
            regsep2 = re.match(r'(.*?,.*?) ([A-Z].*?,.*)$', fss)
            if regsep and not re.search(r' Mc$', regsep.group(1)):
                fssf = regsep.group(1)
                fss = regsep.group(2)
            elif regsep2:
                fssf = regsep2.group(1)
                fss = regsep2.group(2)
            else:
                fssf = fss
                fss = ''

            # flip round the name, e.g.
            #   Bradley, rh Keith <i>(Withington)</i>
            ginp = reflipname.match(fssf)
            if ginp:
                fnam = '%s %s' % (ginp.group(2), ginp.group(1))
                cons = ginp.group(3)
            else:
                # name not being flipped, is firstname lastname
                ginp = renoflipname.match(fssf)
                if not ginp:
                    raise ContextException("No flipped or non-flipped name match (division)",
                                           stamp=stampurl, fragment=fssf)
                fnam = ginp.group(1)
                cons = ginp.group(2)

            (mpid, remadename, remadecons) = memberList.matchfullnamecons(
                fnam, cons, sdate, alwaysmatchcons=False)

            if not mpid and remadename == "MultipleMatch":
                # remadecons is actually the tuple of candidate ids here
                assert isinstance(remadecons, tuple)
                taken = multimatches.setdefault(remadecons, [])
                i = len(taken)  # the index we work with
                if i >= len(remadecons):
                    print("Name %s used too many times for list %s where other instances are %s" %
                          (fnam, remadecons, taken))
                    raise ContextException("Too many instances", stamp=stampurl, fragment=fnam)
                # ambiguity is ignorable when all the ambiguous people vote
                # on the same side of the division
                mpid = remadecons[i]
                taken.append(fnam)

            elif not mpid:
                print("division.py: no match for %s %s %s" % (fnam, cons, sdate))
                raise ContextException("No match on name", stamp=stampurl, fragment=fnam)

            res.append('\t<mpname person_id="%s" vote="%s">%s</mpname>' %
                       (mpid, vote, FixHTMLEntities(fssf)))

    # now we have to check if the multimatched names were all exhausted
    for ids in multimatches:
        if len(multimatches[ids]) != len(ids):
            print("Insufficient vote matches on name %s ids taken to %s" %
                  (multimatches[ids], ids))
            raise ContextException("Not enough vote match on ambiguous name",
                                   stamp=stampurl, fragment=multimatches[ids][0])

    return res
def FilterQuestion(qs, sdate, lords):
    """Convert the text of a written question into a list of <p> strings.

    qs supplies the question text (qs.text) and its stamp (qs.sstampurl).
    The text is split into paragraphs with SplitParaIndents and question
    numbers are pulled out of each with ExtractQnum.

    * lords truthy: an optional leading "asked Her Majesty's Government"
      / "asked the" paragraph is emitted unnumbered, and every other
      paragraph becomes <p qnum="...">.
    * several paragraphs: the first is split at "(1)" (inserted after the
      "To ask the <department>" opening when absent) and each later
      paragraph must begin "(n)"; output is a lead <p> followed by
      <p class="numindent"> entries numbered by position.
    * one paragraph: a single <p qnum="...">.

    Raises ContextException when there are no paragraphs, when no "(1)"
    can be placed, or when a later paragraph lacks its "(n)" prefix.
    """
    # Case-insensitive patterns carry (?i) at the front: a trailing inline
    # flag is deprecated since Python 3.6 and an error from 3.11.
    text = qs.text
    stampurl = qs.sstampurl

    # split into paragraphs; the second result (a parallel array of indent
    # flags) is not needed here
    (textp, textpindent) = SplitParaIndents(text, stampurl)
    if not textp:
        raise ContextException('no paragraphs in result', stamp=stampurl, fragment=text)

    # special case exception: indented text in questions nearly always marks
    # numbered sections, but on this date it was quoted text, so the
    # paragraphs are merged back into one
    if sdate == '2004-01-05' and len(textp) > 1 and re.search('"Given that 98.5 per cent', text):
        textp = [" ".join(textp)]

    # I /think/ this is to match Lords written answers
    if lords:
        stext = []
        start = 0
        # NOTE(review): the apostrophe alternatives reached review as decoded
        # characters (leaving the literal syntactically broken); restored
        # as &#39; / &rsquo; / ' on a best-guess basis -- confirm.
        if re.match(r"(?i)asked Her Majesty(&#39;|&rsquo;|')s Government|asked the ", textp[0]):
            stext.append('<p>%s</p>' % FixHTMLEntities(textp[0]))
            start = 1
        for para in textp[start:]:
            eqnum = ExtractQnum(para, stampurl)
            stext.append('<p qnum="%s">%s</p>' % (eqnum[1], FixHTMLEntities(eqnum[0])))
        return stext

    textn = []

    # multi-part type
    if len(textp) > 1:
        # find the first (1)
        gbone = re.search(r'\(1\)', textp[0])
        if not gbone:
            # no explicit (1): insert one straight after the opening
            # "To ask the <department>, " phrase
            departments = '|'.join(parlPhrases.wransmajorheadings.keys())
            m = re.match(
                r'(?i)To ask the ((Secretary|Minister) of State,? (Ministry of|for( the)?) )?(%s),? '
                % departments, textp[0])
            if not m:
                raise ContextException('no (1) in first multipart para',
                                       fragment=text, stamp=stampurl)
            textp[0] = textp[0][:m.end()] + '(1) ' + textp[0][m.end():]
            gbone = re.search(r'\(1\)', textp[0])

        textn.append((textp[0][:gbone.start(0)], ''))
        textn.append(ExtractQnum(textp[0][gbone.end(0):], stampurl))

        # scan through the rest of the numbered paragraphs; the numbers are
        # not required to be consecutive
        for para in textp[1:]:
            gbnum = re.search(r'^\((\d+)\)', para)
            if not gbnum:
                raise ContextException('no number match in paragraph',
                                       fragment=para, stamp=stampurl)
            textn.append(ExtractQnum(para[gbnum.end(0):], stampurl))

    # single paragraph type
    else:
        textn.append(ExtractQnum(textp[0], stampurl))

    # put the paragraphs back in together, with their numbering
    # should do some blocking out of this, especially the "to ask" phrase.
    pht = PhraseTokenize(qs, textn[0][0])
    firstpara = re.sub('</?p[^>]*>', '', pht.GetPara(''))

    if len(textn) > 1:
        stext = ['<p>%s</p>' % firstpara]
        for i in range(1, len(textn)):
            pht = PhraseTokenize(qs, textn[i][0])
            stext.append('<p class="numindent" qnum="%s">(%d) %s</p>' %
                         (textn[i][1], i, re.sub('</?p[^>]*>', '', pht.GetPara(''))))
    else:
        stext = ['<p qnum="%s">%s</p>' % (textn[0][1], firstpara)]

    return stext
def ParseTable(lstable, stampur):
    """Convert one HTML <table>...</table> chunk into a list of XML lines.

    The chunk is broken into <tr> rows (tolerating unclosed rows).  Text
    found before the first row is taken as the title and becomes a
    <caption>.  Rows ahead of the first one containing a <td> go into
    <thead> (via ParseRow(..., 'th', ...)); the rest go into <tbody>.
    When no row contains a <td>, only the last row is placed in <tbody>.

    Returns the list of output lines.  A table with no rows yields an
    empty <tbody>.

    Raises ContextException when the chunk is not bracketed by
    <table>...</table>, when stray text appears between rows, or when the
    title has an unexpected shape; raises Exception on a nested table tag.
    """
    # Case-insensitive patterns carry (?i) at the front: a trailing inline
    # flag is deprecated since Python 3.6 and an error from 3.11.

    # remove the table bracketing
    stable = re.match(r'(?i)<table[^>]*>\s*([\s\S]*?)\s*</table>$', lstable)
    if not stable:
        # report the offending input (the failed match object is just None)
        raise ContextException('Missing </table> somewhere...', stamp=stampur, fragment=lstable)
    stable = stable.group(1)
    if re.search(r'(?i)<table[^>]*>|</table>', stable):
        print(lstable)
        raise Exception('Double <table> start tag in table parse chunk')

    # break into rows, making sure we can deal with non-closed <tr> symbols
    sprows = re.split(r'(?i)(<tr[^>]*>[\s\S]*?(?:</tr>|(?=<tr[^>]*>)))', stable)

    # build the rows
    stitle = ''
    srows = []
    for sprow in sprows:
        trg = re.match(r'(?i)<tr[^>]*>([\s\S]*?)(?:</tr>)?$', sprow)
        if trg:
            srows.append(trg.group(1))
        elif re.search(r'\S', sprow):
            if (not srows) and (not stitle):
                # text ahead of the first row is the title
                stitle = sprow
            elif not re.match(r'(?i)(?:</t[dhr]>|</font>|</?tbody>|</?thead>|\s)*$', sprow):
                raise ContextException("non-row text", stamp=stampur, fragment=sprow)

    # take out tags round the title; they're always out of order
    # NOTE(review): the last alternative reached review as a decoded
    # character; restored as the `&nbsp;` entity -- confirm.
    stitle = re.sub(r'(?i)</?font[^>]*>|</?p>|</?i>|<br>|<tbody>|</?thead>|&nbsp;',
                    '', stitle).strip()

    ctitle = ''
    if stitle:
        ts = re.match(
            r'(?i)(?:\s|<b>|<center>)+([\s\S]*?)(?:</b>|</center>)+\s*([\s\S]*?)\s*$',
            stitle)
        if not ts:
            raise ContextException(' non-standard table title: %s ' % stitle,
                                   stamp=stampur, fragment=stitle)
        # NOTE(review): the markup pattern handed to FixHTMLEntities is left
        # untouched (including its trailing flag) because how that helper
        # uses it is not visible here.
        Lstitle = ['\t<caption>']
        Lstitle.append(FixHTMLEntities(ts.group(1), '</?font[^>]*>|</?p>|\n(?i)', stampurl=stampur))
        if ts.group(2):
            Lstitle.append(' -- ')
            Lstitle.append(FixHTMLEntities(ts.group(2), '</?font[^>]*>|</?p>|\n(?i)', stampurl=stampur))
        Lstitle.append('</caption>')
        ctitle = ''.join(Lstitle)

    # split into header and body: ih ends on the first row holding a <td>,
    # or on the last row when none does; it stays 0 for a table with no
    # rows (previously an UnboundLocalError)
    ih = 0
    for ih in range(len(srows)):
        if re.search(r'(?i)<td[^>]*>', srows[ih]):
            break

    # construct the text for writing the table
    res = ['<table>']
    if ctitle:
        res.append(ctitle)

    if ih > 0:
        res.append('\t<thead>')
        for srow in srows[:ih]:
            res.append(ParseRow(srow, 'th', stampur))
        res.append('\t</thead>')

    res.append('\t<tbody>')
    for srow in srows[ih:]:
        res.append(ParseRow(srow, 'td', stampur))
    res.append('\t</tbody>')

    res.append('</table>')
    return res
def FilterWransSections(text, sdate, lords=False):
    """Flatten a written-answers page into a list of qspeech objects.

    text is patched with ApplyFixSubstitutions, split into
    (heading, unspoken text, [(speaker, text)]) triplets with
    SplitHeadingsSpeakers, and the leading headings are consumed by
    StripWransHeadings.  For each remaining triplet:

    * a heading without speeches must be a key of
      parlPhrases.wransmajorheadings and becomes a 'major-heading'
      (from 2006-05-08 the upper-cased heading is looked up, and a
      repeated "Written Answers to Questions" title plus the heading
      following it are skipped);
    * otherwise the speeches are read as question/reply groups, each
      preceded by a 'minor-heading'; headings repeated for later questions
      in the same block are marked inserted-heading="true".

    Returns the flat list of headings, questions ('ques') and replies
    ('reply').

    Raises ContextException for unspoken text under a heading, an
    unrecognised or broken heading, a reply with no question, or a
    question with no reply; raises Exception when a major heading carries
    speeches.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    # NOTE(review): the apostrophe alternatives reached review as decoded
    # characters (leaving the literal syntactically broken); restored as
    # &#39; / &rsquo; / ' on a best-guess basis -- confirm.  The (?i) flag
    # leads the pattern because a trailing inline flag is an error from
    # Python 3.11.
    requestion = r"(?i)(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty(&#39;|&rsquo;|')s Government|the ))"

    # full flat list of headings and speeches
    flatb = []
    justhadnewtitle = False  # for when they put another "Written Answers to Questions" and date

    for sht in headspeak[ih:]:
        # triplet of ( heading, unspokentext, [(speaker, text)] )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # update the stamps from the pre-spoken text
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans",
                                   stamp=stampurl, fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        # headings become one unmarked paragraph of text

        # detect if this is a major heading
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in parlPhrases.wransmajorheadings:
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix since text is from a map.
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt]
            qbH = qspeech('nospeaker="true"', majheadingtxtfx, stampurl)
            qbH.typ = 'major-heading'
            qbH.stext = [majheadingtxtfx]
            flatb.append(qbH)
            continue

        elif not speechestxt and sdate > '2006-05-07':
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            if headingtxt.upper() not in parlPhrases.wransmajorheadings:
                if justhadnewtitle:
                    # the heading straight after a repeated title is skipped
                    justhadnewtitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix since text is from a map.
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt.upper()]
            qbH = qspeech('nospeaker="true"', majheadingtxtfx, stampurl)
            qbH.typ = 'major-heading'
            qbH.stext = [majheadingtxtfx]
            flatb.append(qbH)
            justhadnewtitle = False
            continue

        elif not speechestxt:
            raise ContextException('broken heading %s' % headingtxt,
                                   stamp=stampurl, fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in parlPhrases.wransmajorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = re.findall(r'\[(?:HL)?(\d+)R?\]', ss[1])

            # question posed
            if re.match(requestion, qb.text) or re.search('<wrans-question>', qb.text):
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions
                # answered in a block.
                if bNextStartofQ:
                    # the heading is made from the same stampurl as the first question
                    qbh = qspeech(headingmark, headingtxtfx, qb.sstampurl)
                    qbh.typ = 'minor-heading'
                    qbh.stext = [headingtxtfx]
                    flatb.append(qbh)
                    bNextStartofQ = False

                    # used to show that the subsequent headings in this block
                    # have been created, and weren't in the original text.
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                if not lqnums:
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)
                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException('start of question expected',
                                           stamp=qb.sstampurl, fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of
                # the html with this tag
                if re.search(r"\<another-answer-to-follow\>", qb.text):
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # question numbers repeated inside an answer are not checked
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here.
            # Does what I want for now...
            raise ContextException("missing answer to question",
                                   stamp=stampurl, fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb