def GrabWestminDivisionInterruptProced(qbp, rawtext):
    """Split a trailing "sitting suspended" interruption off a speech.

    The last paragraphs of ``qbp.stext`` are inspected.  If the final
    paragraph is an "on resuming" line (which must be preceded by a
    suspended/adjourned line) the last two paragraphs are split off; if it
    is a "sitting suspended" line only the last one is.  The split-off
    paragraphs are returned in a new no-speaker qspeech of typ 'speech' and
    removed from ``qbp.stext``.

    Returns None when the speech has fewer than three paragraphs or when
    no interruption is recognised.

    Raises ContextException when an "on resuming" line is not preceded by a
    recognisable suspended/adjourned line.
    """
    paragraphs = qbp.stext
    if len(paragraphs) < 3:
        return None

    # work out how many trailing paragraphs belong to the interruption
    if re.search("italic.*?>on resuming&\S*</p>(?i)", paragraphs[-1]):
        if not re.search("italic.*?>(?:sitting )?(?:suspended|adjourned)(?: for (?:a division|divisions) in the house)?[\.\s]*(?i)", paragraphs[-2]):
            raise ContextException('failed to detect sitting suspended interruption', fragment=paragraphs[-2])
        ntail = 2
    elif re.search("italic.*?>sitting suspended(?: for| until| till|\.)(?i)", paragraphs[-1]):
        ntail = 1
    else:
        return None

    # The raw text before the suspension is pushed through a throwaway
    # qspeech built on a copy of the stamp before the real object is made.
    # The original author noted this was "something to do with the
    # timestamps" -- presumably qspeech updates the stamp it is given; verify.
    leadtext = re.sub('<p>(?:<stamp aname="[^"]*?"/>)?<i>sitting suspended.*(?si)', '', rawtext)
    stampcopy = copy.copy(qbp.sstampurl)
    qspeech('nospeaker="true"', leadtext, stampcopy)

    interruption = qspeech('nospeaker="true"', "", stampcopy)
    interruption.typ = 'speech'
    interruption.stext = paragraphs[-ntail:]

    # trim the interruption off the speech it came from
    qbp.stext = paragraphs[:-ntail]
    return interruption
def NewGrabLordDivisionProced(qbp, qbd):
    """Split the division preamble off the end of the speech before a division.

    Searches backwards through ``qbp.stext`` for the paragraph matching
    ``redivisionon`` (the division title).  That paragraph and everything
    after it are moved into a new no-speaker qspeech, which is returned.
    If ``qbp`` is itself a no-speaker speech nothing is split off: its
    paragraphs are passed through SubsPWtextset in place and None is
    returned.

    ``qbd`` (the division object) is accepted for interface compatibility
    but is not used here.

    Raises ContextException when ``qbp`` is not a speech/motion with text,
    when no division title is found, or when the paragraph following the
    title does not match ``renewlorddiv`` (the totals).
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech", stamp=qbp.sstampurl)

    # walk back from the end until the division title paragraph is found
    iskim = 1
    while iskim <= len(qbp.stext) and not redivisionon.match(qbp.stext[-iskim]):
        iskim += 1
    if iskim > len(qbp.stext):
        raise ContextException("Could not find Division 'title'", stamp=qbp.sstampurl)

    # The totals must be in the paragraph straight after the title.  If the
    # title is the very last paragraph there is no such paragraph: indexing
    # with -iskim + 1 == 0 would test the FIRST paragraph of the speech
    # instead, so that case is reported explicitly.
    if iskim == 1:
        print(qbp.stext[-1])
        raise ContextException("no totals before division", stamp=qbp.sstampurl)
    if not renewlorddiv.match(qbp.stext[-iskim + 1]):
        print(qbp.stext[-iskim + 1])
        raise ContextException("no totals before division", stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # copy the title and following lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim those lines off the speech they came from
    qbp.stext = qbp.stext[:-iskim]
    return qbdp
def GrabLordDivisionProced(qbp, qbd):
    """Split the two-line division preamble off the speech before a division.

    The last paragraph of ``qbp.stext`` must match ``relorddiv`` and the one
    before it must match ``resaidamend``.  Those two paragraphs are moved
    into a new no-speaker qspeech, which is returned.  If ``qbp`` is itself
    a no-speaker speech nothing is split off: its paragraphs are passed
    through SubsPWtextset in place and None is returned.

    ``qbd`` (the division object) is accepted for interface compatibility
    but is not used here.

    Raises ContextException when ``qbp`` is not a speech/motion with text or
    when either expected paragraph is missing or does not match.
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech", stamp=qbp.sstampurl)

    if not relorddiv.match(qbp.stext[-1]):
        print(qbp.stext[-1])
        raise ContextException("no lordships divided before division", stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    # A one-paragraph speech has no "on said amendment" line at all; report
    # that as a context error rather than letting stext[-2] raise IndexError.
    if len(qbp.stext) < 2:
        print(qbp.stext[-1])
        raise ContextException("no on said amendment", stamp=qbp.sstampurl, fragment=qbp.stext[-1])
    if not resaidamend.match(qbp.stext[-2]):
        print(qbp.stext[-2])
        raise ContextException("no on said amendment", stamp=qbp.sstampurl, fragment=qbp.stext[-2])
    iskim = 2

    # copy the two lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim back the given one by two lines
    qbp.stext = qbp.stext[:-iskim]
    return qbdp
def DivisionParsingPart(divno, unspoketxt, stampurl, sdate):
    """Cut the division listing off the front of ``unspoketxt``.

    The end of the division is located with ``regenddiv``; text before
    group 1 of that match becomes the division text and the rest is handed
    back.  A leading ``strexplicitenddiv`` marker on the remainder is
    stripped.  When no end is found the whole text is treated as the
    division, a hint is printed, and the remainder is empty.

    Returns ``(remaining_text, qbd)`` where ``qbd`` is a qspeech of typ
    "division" whose ``stext`` comes from FilterDivision.
    """
    endmatch = re.search(regenddiv, unspoketxt)
    if not endmatch:
        # no end marker: everything is division text
        divtext = unspoketxt
        print(unspoketxt)
        print("division missing %s" % regenddiv)
        print("try inserting <explicit-end-division>")
        remainder = ""
    else:
        cut = endmatch.start(1)
        divtext = unspoketxt[:cut]
        remainder = unspoketxt[cut:]
        if re.match(strexplicitenddiv, remainder):
            # strip off signal tag
            remainder = remainder[len(strexplicitenddiv):]

    # Add a division object (will contain votes and motion text)
    attributes = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno)
    qbd = qspeech(attributes, divtext, stampurl)
    qbd.typ = "division"  # this type field seems easiest way

    # filtering divisions here because we may need more sophisticated detection
    # of end of division than the "Question accordingly" marker.
    qbd.stext = FilterDivision(qbd.text, stampurl, sdate)

    return (remainder, qbd)
def FilterWMSSections(text, sdate, lords=False):
    """Flatten a day of Written Ministerial Statements into qspeech objects.

    The text has the fix substitutions applied, is split into
    (heading, pre-speech text, [(speaker, text)]) groups, and the groups
    after the leading headings (as counted by StripWMSHeadings) are each
    turned into a heading object followed by one 'speech' object per
    statement, each passed through FilterWMSSpeech.

    Returns the flat list of qspeech objects.

    Raises ContextException if non-blank text sits between a heading and
    its first speech.  Any other exception raised while processing a group
    is re-raised as a ContextException carrying the current stamp.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # split into list of triples of (heading, pre-first speech text, [ (speaker, text) ])
    headspeak = SplitHeadingsSpeakers(text)
    (ih, stampurl) = StripWMSHeadings(headspeak, sdate, lords)

    flatb = []
    for sht in headspeak[ih:]:
        try:
            # we're getting stamps inside the headings sometimes
            headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
            unspoketxt = sht[1]
            speechestxt = sht[2]

            # only tags and whitespace may appear before the first speech
            if not re.match(r'(?:<[^>]*>|\s| )*$', unspoketxt):
                raise ContextException("unspoken text under heading in WMS", stamp=stampurl, fragment=unspoketxt)

            qbh = NormalHeadingPart(headingtxt, stampurl, sdate, speechestxt, lords)
            flatb.append(qbh)
            stampurl.UpdateStampUrl(unspoketxt)

            # Put everything in XML, de-dupe elsewhere
            for ss in speechestxt:
                qb = qspeech(ss[0], ss[1], stampurl)
                qb.typ = 'speech'
                FilterWMSSpeech(qb)
                flatb.append(qb)

        except ContextException:
            raise
        except Exception as e:
            # add extra stamp info to the exception
            raise ContextException(str(e), stamp=stampurl)

    # previously the collected list was built and then dropped
    return flatb
def NormalHeadingPart(headingtxt, stampurl, sdate, speechestxt, lords):
    """Build a heading qspeech and classify it as major or minor.

    Classification, in order (later rules override earlier ones):
      * Lords headings are never major;
      * otherwise a heading with no lower-case letters (other than 'BNFL'),
        or one whose stamp anchor contains '_dpthd' or '_head', is major;
      * an anchor containing '_sbhd' makes it not major;
      * after 2006-05-07 a heading is major exactly when it has no speeches.

    A major heading is replaced by its entry in
    ``parlPhrases.wransmajorheadings`` (looked up by upper-cased text).

    Returns a no-speaker qspeech whose ``typ`` is 'major-heading' or
    'minor-heading' and whose ``stext`` is the single heading paragraph.

    Raises ContextException for a major heading missing from the map.
    """
    if lords:
        is_major = False
    else:
        shouted = not re.search('[a-z]', headingtxt) and headingtxt != 'BNFL'
        is_major = bool(shouted
                        or re.search('_dpthd', stampurl.aname)
                        or re.search('_head', stampurl.aname))

    if re.search('_sbhd', stampurl.aname):
        is_major = False

    if sdate>'2006-05-07':
        # Assume major heading if no speeches in new style
        is_major = not speechestxt

    if is_major:
        if not parlPhrases.wransmajorheadings.has_key(headingtxt.upper()):
            raise ContextException("unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)", fragment = headingtxt, stamp = stampurl)
        # no need to fix since text is from a map.
        headingtxt = parlPhrases.wransmajorheadings[headingtxt.upper()]

    fixedtxt = FixHTMLEntities(headingtxt)
    heading = qspeech('nospeaker="true"', fixedtxt, stampurl)
    heading.typ = 'major-heading' if is_major else 'minor-heading'

    # headings become one unmarked paragraph of text
    heading.stext = [ fixedtxt ]
    return heading
def LordsHeadingPart(headingtxt, stampurl, major):
    """Build a Lords heading qspeech from the heading text.

    The heading is 'major-heading' only when ``major`` is true and the
    stamp's date is after 2008-12-01; otherwise it is 'minor-heading'.
    The entity-fixed heading text becomes the single paragraph in
    ``stext``.
    """
    cleaned = FixHTMLEntities(headingtxt)
    heading = qspeech('nospeaker="true"', cleaned, stampurl)

    is_major = major and stampurl.sdate > '2008-12-01'
    heading.typ = 'major-heading' if is_major else 'minor-heading'

    # headings become one unmarked paragraph of text
    heading.stext = [ cleaned ]
    return heading
def LordsDivisionParsingPart(divno, unspoketxt, stampurl, sdate):
    """Cut the Lords division listing off the front of ``unspoketxt``.

    The end of the division is located with ``regenddiv``; text before
    group 1 of that match is the division and ':ENDDIVISION:' markers are
    removed from the rest.  For dates after 2008-12-01 with no such match,
    the division is taken to run up to the last ", X.</p>" (a capital
    letter and full stop) or, failing that, the last "<br>".

    Returns ``(remaining_text, qbd)`` where ``qbd`` is a qspeech of typ
    'division' whose ``stext`` comes from LordsFilterDivision.

    Raises ContextException when the end of the division cannot be found or
    when ``stampurl`` carries no timestamp.
    """
    # find the ending of the division and split it off.
    gquesacc = re.search(regenddiv, unspoketxt)
    if gquesacc:
        divtext = unspoketxt[:gquesacc.start(1)]
        unspoketxt = unspoketxt[gquesacc.start(1):]
        unspoketxt = re.sub(':ENDDIVISION:', '', unspoketxt)
    elif sdate > '2008-12-01':
        # Sigh XXX -- new-style pages: greedy match up to the last name in the list
        m = re.match(r'(?s).*, [A-Z]\.</p>', unspoketxt)
        if not m:
            m = re.match(r'(?s).*<br>', unspoketxt)
        if not m:
            # neither fallback matched; previously this crashed on m.group()
            raise ContextException("Division missing end of voting list", stamp=stampurl, fragment="Division")
        divtext = m.group()
        unspoketxt = unspoketxt[m.end():]
    else:
        print("division missing %s" % regenddiv)
        print(unspoketxt)
        print("is there a linefeed before the </center> on the CONTENTS?")
        raise ContextException("Division missing resolved in the", stamp=stampurl, fragment="Division")

    divtext = re.sub(' style="margin-bottom:[^"]*"', '', divtext)

    # Add a division object (will contain votes and motion text)
    spattr = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno)
    qbd = qspeech(spattr, divtext, stampurl)
    qbd.typ = 'division'  # this type field seems easiest way

    if not stampurl.timestamp:
        raise ContextException("Division missing any timestamps; need to put one in to make it consistent.  like <h5>2.44 pm</h5>", stamp=stampurl, fragment="Division")

    # filtering divisions here because we may need more sophisticated detection
    # of end of division than the "Question accordingly" marker.
    qbd.stext = LordsFilterDivision(qbd.text, stampurl, sdate)

    return (unspoketxt, qbd)
def LordsFilterSections(text, sdate):
    """Flatten a day of Lords debate text into a list of qspeech objects.

    The text is split into heading groups; each group yields a heading (or
    is merged into the previous heading), or a division plus the preamble
    split off the previous speech, followed by any unspoken text and the
    group's speeches (each expanded by FilterLordsSpeech).  The ``typ`` of
    each object tells headings, speeches and divisions apart.

    Returns the flat list, or None when StripLordsDebateHeadings reports no
    starting index.
    """
    # deal with one exceptional case of indenting
    if sdate == "2005-10-26":
        lengthbefore = len(text)
        text = re.sub("<ul><ul>(<ul>)?", "<ul>", text)
        text = re.sub("</ul></ul>(</ul>)?", "</ul>", text)
        # regsection1 = '<h\d><center>.*?\s*</center></h\d>' in splitheadingsspeakers.py
        print("Duplicate <ul>s removed and <center> sorted on %s which shortened text by %d" % (sdate, lengthbefore - len(text)))

    # list of (heading, pre-first speech text, [ (speaker, text) ], major?) groups
    headspeak = SplitHeadingsSpeakers(text)

    # skip the leading headings
    (ih, stampurl) = StripLordsDebateHeadings(headspeak, sdate)
    if ih == None:
        return

    flatb = [ ]
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(string.strip(sht[0]))
        unspoketxt = sht[1]
        speechestxt = sht[2]
        headingmajor = sht[3]

        # a heading is either a division heading or an ordinary one
        gdiv = re.search('Division No\.(?:\s| )+(\d+)', headingtxt)
        assert not re.match("(?:NOT-)?CONTENTS", headingtxt)

        if gdiv:
            divno = string.atoi(gdiv.group(1))
            (unspoketxt, qbd) = LordsDivisionParsingPart(divno, unspoketxt, stampurl, sdate)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            grabber = NewGrabLordDivisionProced if sdate >= '2008-12-01' else GrabLordDivisionProced
            qbdp = grabber(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)
        else:
            qbh = LordsHeadingPart(headingtxt, stampurl, headingmajor)

            # a minor heading is rammed into the previous heading when that
            # one is minor too, or (after 2008-12-01) when it is major
            ramtogether = False
            if qbh.typ == 'minor-heading' and len(flatb) > 0:
                prevtyp = flatb[-1].typ
                ramtogether = (prevtyp == 'minor-heading'
                               or (sdate>'2008-12-01' and prevtyp == 'major-heading'))

            if ramtogether:
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)
            else:
                # otherwise put out this heading
                flatb.append(qbh)

        if re.match('(?:<[^>]*>|\s)*$', unspoketxt):
            # there is no text; update from stamps if there are any
            stampurl.UpdateStampUrl(unspoketxt)
        else:
            # output unaccounted for unspoken text occurring after a
            # division, or after a heading
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

        # put each of the speeches in the block into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

    # we now have everything flattened out in a series of speeches
    return flatb
def FilterLordsSpeech(qb):
    """Split one Lords speech into spoken and non-spoken parts.

    ``qb`` is first run through FilterDebateSpeech.  Depending on how its
    first paragraph starts (asked / rose to ask / took the oath / was
    introduced / moved an amendment) leading paragraphs may be split off
    into a separate qspeech, and paragraphs from the first recognised
    motion statement onwards are split off into a no-speaker qspeech whose
    paragraphs are tagged with a ``pwmotiontext`` attribute.  ``qb`` is
    modified in place.

    Returns the list of resulting qspeech objects, in order.

    Raises ContextException when the expected "noble Lord said" paragraph
    or moved-amendment text is missing, or when the speech start index runs
    past the end of the paragraphs.  Several internal consistency checks are
    plain asserts.
    """
    # pull in the normal filtering that gets done on debate speeches
    # does the paragraph indents and tables.  Maybe should be inlined for lords
    FilterDebateSpeech(qb)

    # the colon attr is blank or has a : depending on what was there after the name that was matched
    # index of the first paragraph after the pre-speech part ("plus 1" style);
    # 0 means "not yet known"
    ispeechstartp1 = 0 # plus 1

    # no colonattr or colon, must be making a speech
    recol = re.search('colon="(:?)"', qb.speaker)
    bSpeakerExists = not re.match('nospeaker="true"', qb.speaker)
    if bSpeakerExists and (not recol or recol.group(1)):
        # text of this kind at the beginning should not be spoken, assume there wasn't a colon
        if not re.search("<p>(?:moved|asked|rose to move,) (?i)", qb.stext[0]) or re.search("<p>moved formally(?i)", qb.stext[0]):
            ispeechstartp1 = 1 # 0th paragraph is speech text

    res = [ ] # output list
    preparagraphtype = ""
    # a named speaker whose first paragraph is not ordinary speech: work out
    # from the opening words what kind of pre-speech text this is
    if bSpeakerExists and (ispeechstartp1 == 0):
        if re.match("<p>asked Her Majesty's Government|<p>asked the|<p>—Took the Oath", qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 == len(qb.stext):
                # No Noble Lord said, the usual
                ispeechstartp1 = 1
            if ispeechstartp1 != 1:
                print "Noble Lord Said on ", ispeechstartp1, "paragraph"
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)", qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 not in [1, 2]:
                print "Noble Lord Said on ", ispeechstartp1, "paragraph"
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        # identify a writ of summons (single line)
        elif re.match("<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$", qb.stext[0]):
            assert len(qb.stext) == 1
            qb.stext[0] = re.sub('^<p>', '<p pwmotiontext="summons">', qb.stext[0]) # kludgy; already have the <p>-tag embedded in the string
            res.append(qb)
            return res # bail out

        # an introduction of a new peer: emitted as a no-speaker speech
        # (qbunspo.stext is the same list object as qb.stext)
        elif re.search("having been created.*?Was, in (his|her) robes, introduced", qb.stext[0]):
            assert len(qb.stext) == 1
            qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
            qbunspo.typ = 'speech'
            qbunspo.stext = qb.stext
            qbunspo.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">', qbunspo.stext[0])
            res.append(qbunspo)
            return res

        # NOTE(review): this pattern is also an alternative of the first
        # branch above, so this branch looks unreachable -- confirm
        elif re.match("<p>—Took the Oath", qb.stext[0]):
            assert False

        # identify a moved amendment
        elif re.match("<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention", qb.stext[0]):
            # find where the speech begins, and strip out "The noble lord said:"
            preparagraphtype = "moved"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)

            # everything up to this point is non-speech
            assert ispeechstartp1 > 0
            qbprev = qspeech(qb.speaker, "", qb.sstampurl)
            qbprev.typ = 'speech'
            qbprev.stext = qb.stext[:ispeechstartp1]
            res.append(qbprev)
            if ispeechstartp1 == len(qb.stext):
                return res

            # upgrade the spoken part
            qb.speaker = string.replace(qb.speaker, 'colon=""', 'colon=":"')
            del qb.stext[:ispeechstartp1]
            assert qb.stext
            ispeechstartp1 = 1 # the spoken text must reach at least here (after the line, "The noble lord said:")

        # error, no moved amendment found
        else:
            print qb.stext
            print "no moved amendment; is a colon missing after the name?"
            raise ContextException("missing moved amendment", stamp=qb.sstampurl)

    # advance to place where non-speeches happen
    if ispeechstartp1 > len(qb.stext):
        print "ispeechstartp1 problem; speeches running through", ispeechstartp1, len(qb.stext)
        print qb.stext
        raise ContextException("end of speech boundary unclear running through; need to separate paragraphs?", stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    sAmendmentStatement = None
    while bSpeakerExists and (ispeechstartp1 < len(qb.stext)):
        sAmendmentStatement = MatchPWmotionStuff(qb, ispeechstartp1)
        if sAmendmentStatement:
            break
        ispeechstartp1 += 1

    # there are no further lines after the withdrawal
    if ispeechstartp1 == len(qb.stext):
        assert not sAmendmentStatement
        res.append(qb)
        return res

    # do the further lines after withdrawal
    assert (not bSpeakerExists) or sAmendmentStatement

    # splice off the unspoken text running off from the amendment statements
    if ispeechstartp1 != 0:
        qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
        qbunspo.typ = 'speech'
        qbunspo.stext = qb.stext[ispeechstartp1:]
        del qb.stext[ispeechstartp1:]
        res.append(qb)
        res.append(qbunspo)
    else:
        # nothing spoken at all: the whole of qb is the unspoken part
        res.append(qb)
        qbunspo = qb

    # check that once we begin pwmotion amendment statements, all statements are of this type
    for i in range(len(qbunspo.stext)):
        # only paragraphs that start with a <p tag get the attribute
        if not re.match('<p', qbunspo.stext[i]):
            continue
        sAmendmentStatement = MatchKnownAsPWmotionStuff(qbunspo, i)
        if not sAmendmentStatement:
            if IsNotQuiet():
                print "UNRECOGNIZED-MOTION-TEXT%s: %s" % (bSpeakerExists and " " or "(*)", qbunspo.stext[i])
            sAmendmentStatement = "unrecognized"
        qbunspo.stext[i] = re.sub('^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % sAmendmentStatement, qbunspo.stext[i])

    return res
def LordsFilterSections(text, sdate):
    """Flatten a day of Lords debate text into a list of qspeech objects.

    The text is split into heading groups.  Each group produces either a
    heading object (minor headings are merged into the previous heading
    where possible) or a division object preceded by the preamble split off
    the previous speech; then any unspoken text and the group's speeches
    follow, each expanded by FilterLordsSpeech.  The ``typ`` attribute of
    each object distinguishes headings, speeches and divisions.

    Returns the flat list, or None when StripLordsDebateHeadings reports no
    starting index.

    Raises ContextException when a division heading appears before any
    other output (there is no previous speech to take its preamble from).
    """
    # deal with one exceptional case of indenting
    if sdate == "2005-10-26":
        lengthbefore = len(text)
        text = re.sub("<ul><ul>(<ul>)?", "<ul>", text)
        text = re.sub("</ul></ul>(</ul>)?", "</ul>", text)
        # regsection1 = '<h\d><center>.*?\s*</center></h\d>' in splitheadingsspeakers.py
        print("Duplicate <ul>s removed and <center> sorted on %s which shortened text by %d" % (
            sdate, lengthbefore - len(text)))

    # split into list of (heading, pre-first speech text, [ (speaker, text) ], major?) groups
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripLordsDebateHeadings(headspeak, sdate)
    if ih is None:
        return

    # loop through each detected heading and the detected partitioning of speeches which follow.
    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    # see the typ variable for the type.
    flatb = []
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]
        headingmajor = sht[3]

        # the heading detection, as a division or a heading speech object
        # detect division headings
        gdiv = re.search(r'Division No\.(?:\s| )+(\d+)', headingtxt)
        assert not re.match("(?:NOT-)?CONTENTS", headingtxt)

        # heading type
        if not gdiv:
            qbh = LordsHeadingPart(headingtxt, stampurl, headingmajor)

            # ram together minor headings into previous ones which have no speeches
            if qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'minor-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # after 2008-12-01 a minor heading is also rammed into a preceding major one
            elif sdate > '2008-12-01' and qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'major-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division type
        else:
            (unspoketxt, qbd) = LordsDivisionParsingPart(int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # the division preamble is taken off the previous item; with
            # nothing emitted yet flatb[-1] would be a bare IndexError
            if not flatb:
                raise ContextException("Division with nothing before it", stamp=stampurl, fragment=headingtxt)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            if sdate >= '2008-12-01':
                qbdp = NewGrabLordDivisionProced(flatb[-1], qbd)
            else:
                qbdp = GrabLordDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

        # continue and output unaccounted for unspoken text occurring after a
        # division, or after a heading
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

    # we now have everything flattened out in a series of speeches
    return flatb
def FilterWransSections(text, sdate, lords=False):
    """Flatten a day of Written Answers into a list of qspeech objects.

    After the fix substitutions, the text is split into heading groups.
    Groups without speeches are major headings (looked up in
    ``parlPhrases.wransmajorheadings``); groups with speeches are question
    batches, emitted as a 'minor-heading' followed by alternating 'ques'
    and 'reply' objects.  When a batch holds more than one question/reply
    block, the heading is re-emitted before each later block, marked
    ``inserted-heading="true"``.

    Returns the flat list of qspeech objects.

    Raises ContextException for text before the first speech, unrecognised
    or broken headings, a reply with no preceding question, or a question
    left without an answer.  Raises Exception when speeches are found under
    a heading that is in the major-headings map.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    flatb = []
    justhadnewtitle = False  # For when they put another "Written Answers to Questions" and date
    for sht in headspeak[ih:]:
        # triplet of ( heading, unspokentext, [(speaker, text)] )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # update the stamps from the pre-spoken text
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans", stamp=stampurl, fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        # headings become one unmarked paragraph of text

        # detect if this is a major heading (old style: all capitals, no speeches)
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in parlPhrases.wransmajorheadings:
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix since text is from a map.
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt]
            qbH = qspeech('nospeaker="true"', majheadingtxtfx, stampurl)
            qbH.typ = 'major-heading'
            qbH.stext = [majheadingtxtfx]
            flatb.append(qbH)
            continue

        # new style: any heading without speeches is a major heading
        elif not speechestxt and sdate > '2006-05-07':
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            if headingtxt.upper() not in parlPhrases.wransmajorheadings:
                # the unknown heading straight after a repeated title is skipped
                if justhadnewtitle:
                    justhadnewtitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix since text is from a map.
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt.upper()]
            qbH = qspeech('nospeaker="true"', majheadingtxtfx, stampurl)
            qbH.typ = 'major-heading'
            qbH.stext = [majheadingtxtfx]
            flatb.append(qbH)
            justhadnewtitle = False
            continue

        elif not speechestxt:
            raise ContextException('broken heading %s' % headingtxt, stamp=stampurl, fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in parlPhrases.wransmajorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = re.findall(r'\[(?:HL)?(\d+)R?\]', ss[1])

            # question posed.  The apostrophe in "Her Majesty's" may be a
            # plain or curly quote; the equivalent HTML entities are accepted
            # too.  (The literal used to be single-quoted with a bare
            # apostrophe inside it, which ended the string early.)
            if re.match(r"(?i)(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty('|’|&#0?39;|&#8217;|&rsquo;)s Government|the ))", qb.text) or \
                    re.search('<wrans-question>', qb.text):
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions answered in a block.
                if bNextStartofQ:
                    # we need to make the heading from the same stampurl as the first question
                    qbh = qspeech(headingmark, headingtxtfx, qb.sstampurl)
                    qbh.typ = 'minor-heading'
                    qbh.stext = [headingtxtfx]
                    flatb.append(qbh)
                    bNextStartofQ = False

                    # used to show that the subsequent headings in this block have been created,
                    # and weren't in the original text.
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                if not lqnums:
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)
                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException('start of question expected', stamp=qb.sstampurl, fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of the html with this tag
                if re.search('<another-answer-to-follow>', qb.text):
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # question numbers repeated inside an answer are deliberately not checked
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here. Does what I want for now...
            raise ContextException("missing answer to question", stamp=stampurl, fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb
def FilterDebateSections(text, sdate, typ):
    """Flatten a day of Commons debate or Westminster Hall text into qspeech objects.

    ``typ`` must be "debate" or "westminhall".  The text is split into
    heading groups; each group yields a heading (possibly merged into the
    previous heading, or turned into a no-speaker speech for suspensions
    and "in the Chair" notes) or a division with the preamble split off the
    previous speech, followed by any unspoken text and the group's
    speeches.  The ``typ`` attribute of each object distinguishes headings,
    speeches and divisions.

    Returns the flat list of qspeech objects.

    Raises ContextException when a division heading appears before any
    other output; helpers called here may raise it too.  Several internal
    consistency checks are plain asserts.
    """
    # make the corrections at this level which enables the headings to be resolved.
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)
    else:
        assert typ == "westminhall"

    # collapse triple-nested lists and drop empty <h5> elements
    text = re.sub('(?i)<ul><ul><ul>', '<ul>', text)
    text = re.sub('(?i)</ul></ul></ul>', '</ul>', text)
    text = re.sub('(?i)<h5></h5>', '', text)

    # split into list of (heading, pre-first speech text, [ (speaker, text) ], major?) groups
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    if typ == "debate":
        (ih, stampurl) = StripDebateHeadings(headspeak, sdate)
    elif typ == "westminhall":
        (ih, stampurl) = StripWestminhallHeadings(headspeak, sdate)
    else:
        assert False  # to be for writminstat?

    # loop through each detected heading and the detected partitioning of speeches which follow.
    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    # see the typ variable for the type.
    flatb = []
    state = {}  # handed to every NormalHeadingPart call
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        headingmajor = sht[3]
        if typ == 'debate' and (headingmajor or sht == headspeak[-1]):  # UGH again
            headingtxt = headingtxt.upper()
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # the heading detection, as a division or a heading speech object
        # detect division headings
        gdiv = re.match(r'(?i)(?:<b>)?Division No. (\d+)', headingtxt)

        # heading type
        if not gdiv:
            qbh = NormalHeadingPart(headingtxt, stampurl, state, typ)

            # ram together minor headings into previous ones which have no speeches
            if qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'minor-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # ram together major headings into previous ones which have no speeches
            elif qbh.typ == 'major-heading' and flatb and flatb[-1].typ == 'major-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # an "Allotted/Allocated Day" line, or anything under "Petition",
            # joins the major heading before it
            elif qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'major-heading' and \
                    (re.search('(?i)(Allotted|Allocated) Day', qbh.stext[-1])
                     or re.search('(?i)^Petition$', flatb[-1].stext[-1])):
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # a suspension / resumption note: kept only when it follows a
            # speech, and then as a no-speaker speech rather than a heading
            elif re.search(r"(?i)(?:sitting suspended(?: for| until| till|\.))|(on resuming&)", qbh.stext[0]):
                if flatb and flatb[-1].typ == 'speech':
                    qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                    qb.typ = 'speech'
                    FilterDebateSpeech(qb)
                    flatb.append(qb)

            # "[X in the Chair]" after a speech also becomes a no-speaker speech
            elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", qbh.stext[0]) and flatb and flatb[-1].typ == 'speech':
                qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                qb.typ = 'speech'
                FilterDebateSpeech(qb)
                flatb.append(qb)

            # this is where we suck in a trailing "Clause" part of the title that is mistakenly outside the heading.
            elif (qbh.typ == 'minor-heading' or qbh.typ == 'major-heading') and flatb and flatb[-1].typ == 'speech':
                mmm = re.match(r'(?i)\s*<p>\s*((?:New )?(?:clause|schedule) \d+\w?)</p>', flatb[-1].stext[-1])
                if mmm:
                    if IsNotQuiet():
                        print("Clause/schedule moving %s" % (flatb[-1].stext[-1],))
                    qbh.stext.insert(0, " — ")
                    qbh.stext.insert(0, mmm.group(1))
                    flatb[-1].stext = flatb[-1].stext[:-1]  # delete final value

                    # remove an empty speech
                    if not flatb[-1].stext:
                        if IsNotQuiet():
                            print("removing empty speech after moving 'clause/schedule' out")
                        assert flatb[-1].speaker == 'nospeaker="true"'
                        del flatb[-1]

                # converting a search into a match, for safety, and double checking
                else:
                    if re.search(r'(?i)<p>\s*((?:New )?\s*(?:clause|schedule)\s*\w+)\s*</p>', flatb[-1].stext[-1]):
                        print(flatb[-1].stext[-1])
                        assert False

                flatb.append(qbh)

            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division case
        else:
            (unspoketxt, qbd) = DivisionParsingPart(int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # the division preamble is taken off the previous item; with
            # nothing emitted yet flatb[-1] would be a bare IndexError
            if not flatb:
                raise ContextException("Division with nothing before it", stamp=stampurl, fragment=headingtxt)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            qbdp = GrabDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

            # write out our file with the report of all divisions
            PreviewDivisionTextGuess(flatb)

        # continue and output unaccounted for unspoken text occurring after a
        # division, or after a heading
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb)
            flatb.append(qb)

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb, bDebateBegToMove=True)

            # captures tail off westminster hall speeches
            qbdp = GrabWestminDivisionInterruptProced(qb, ss[1])
            flatb.append(qb)
            if qbdp:
                flatb.append(qbdp)

    # previously the collected list was built and then dropped
    return flatb
def GrabDivisionProced(qbp, qbd):
    """Split the "question put / House divided" lines off the speech before a division.

    The last paragraph(s) of ``qbp.stext`` must report that the House (or
    Committee) divided; several layouts are tried in turn.  The paragraphs
    from the last one matching ``reqput`` onwards (or just the last
    paragraph if none matches) are moved into a new no-speaker qspeech,
    which is returned.  If ``qbp`` is itself a no-speaker speech nothing is
    split off: its paragraphs are passed through SubsPWtextset in place and
    None is returned.

    ``qbd`` (the division object) is not used here.

    Returns None for the two hard-coded correction dates (2003-12-18 and
    2003-09-16) instead of raising.

    Raises Exception when ``qbp`` is not a speech with text, and
    ContextException when no "divided" line is recognised.
    """
    if qbp.typ != 'speech' or len(qbp.stext) < 1:
        # this is that crazy correction one
        if qbp.sstampurl.sdate == '2003-12-18':
            return None
        print qbp.stext
        raise Exception, "previous to division not speech"

    # join up italics that were split across adjacent <i> elements
    qbp.stext[-1] = re.sub(' </i><i> ', ' ', qbp.stext[-1])
    qbp.stext[-1] = re.sub('</i><i> ', ' ', qbp.stext[-1])

    # simplest layout: the whole "divided" statement is in the last paragraph
    hdg = rehousediv.match(qbp.stext[-1])
    if not hdg:
        # two-part layout: part a in the previous paragraph, part b in the last
        # NOTE(review): stext[-2] raises IndexError on a one-paragraph speech -- confirm that cannot occur
        hdg_a = rehousediv_a.match(qbp.stext[-2])
        hdg_b = rehousediv_b.match(qbp.stext[-1])
        if hdg_a and hdg_b:
            hdg = hdg_b
        elif hdg_b:
            # They are occasionally putting "The" "Committee"
            # "divided" in two or three separate paragraphs
            two_prev = re.sub('</p><p[^>]*>', '', ''.join(qbp.stext[-3:-1]))
            three_prev = re.sub('</p><p[^>]*>', '', ''.join(qbp.stext[-4:-1]))
            # when the joined paragraphs match, they are replaced in stext by the joined form
            if rehousediv_a.match(three_prev):
                qbp.stext = qbp.stext[:-4] + [ three_prev, qbp.stext[-1] ]
                hdg = hdg_b
            elif rehousediv_a.match(two_prev):
                qbp.stext = qbp.stext[:-3] + [ two_prev, qbp.stext[-1] ]
                hdg = hdg_b
    if not hdg:
        # layout with an extra rehousediv_england paragraph between the divided lines and the end
        # NOTE(review): hdg_b is the match against stext[-1], not stext[-3]; if it
        # is None here hdg stays unset -- confirm this is intended.
        # NOTE(review): stext[-4] raises IndexError on speeches shorter than four paragraphs.
        if rehousediv_a.match(qbp.stext[-4]) and rehousediv_b.match(qbp.stext[-3]) and rehousediv_england.match(qbp.stext[-2]):
            hdg = hdg_b
    if not hdg:
        hdg = redivshouldappear.match(qbp.stext[-1])
    if not hdg:
        # another correction one
        if qbp.sstampurl.sdate != '2003-09-16':
            raise ContextException, "no house divided before division: %s" % qbp.stext[-1]
        return None

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    iskim = 1
    if re.search('Serjeant at Arms', qbp.stext[-2]):
        # only the last paragraph is skimmed in this case
        pass
    else:
        # walk backwards to the paragraph matching reqput
        while len(qbp.stext) >= iskim:
            if reqput.match(qbp.stext[-iskim]):
                break
            iskim += 1

        # haven't found a question put before we reach the front
        if len(qbp.stext) < iskim:
            iskim = 1
            # VALID in 99% of cases: raise Exception, "no question put before division"

    # copy the skimmed lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim the skimmed lines off the end of the previous speech
    qbp.stext = qbp.stext[:-iskim]

    return qbdp
def FilterDebateSections(text, sdate, typ):
    """Flatten a day's debate or Westminster Hall text into a list of blocks.

    text  -- the page text to split up.
    sdate -- the sitting date; handed on to the helpers.
    typ   -- "debate" or "westminhall"; anything else trips an assertion.

    The text is split by SplitHeadingsSpeakers, the leading entries are
    consumed by StripDebateHeadings / StripWestminhallHeadings, and each
    remaining entry is turned into qspeech blocks: headings (through
    NormalHeadingPart), divisions (through DivisionParsingPart, with the
    lead-in peeled off the previous block by GrabDivisionProced) and
    speeches.  Each block's typ attribute says which kind it is.

    Returns the flat list of blocks, in document order.

    Raises ContextException when a division heading arrives before any
    block has been emitted; exceptions from the helpers propagate unchanged.

    NOTE(review): the function previously fell off the end without
    returning the list it had built; returning it matches
    FilterWransSections - confirm no caller relied on None.
    """
    # make the corrections at this level which enables the headings to be resolved.
    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)
    else:
        assert typ == "westminhall"

    # flatten triple-nested lists and drop empty h5 headings
    text = re.sub('<ul><ul><ul>(?i)', '<ul>', text)
    text = re.sub('</ul></ul></ul>(?i)', '</ul>', text)
    text = re.sub('<h5></h5>(?i)', '', text)

    # split into a list of ( heading, pre-first-speech text, [ (speaker, text) ], major? )
    headspeak = SplitHeadingsSpeakers(text)

    # ih is the index of the first entry still to be processed
    if typ == "debate":
        (ih, stampurl) = StripDebateHeadings(headspeak, sdate)
    elif typ == "westminhall":
        (ih, stampurl) = StripWestminhallHeadings(headspeak, sdate)
    else:
        assert False  # to be for writminstat?

    def unspoken_speech(txt):
        # wrap text nobody spoke in a filtered no-speaker speech block
        qbu = qspeech('nospeaker="true"', txt, stampurl)
        qbu.typ = 'speech'
        FilterDebateSpeech(qbu)
        return qbu

    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    flatb = []
    state = {}  # shared between successive NormalHeadingPart calls
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        headingmajor = sht[3]
        if typ == 'debate' and (headingmajor or sht == headspeak[-1]):  # UGH again
            headingtxt = headingtxt.upper()
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # a heading is either a division heading or an ordinary one
        gdiv = re.match('(?:<b>)?Division No. (\d+)(?i)', headingtxt)

        if not gdiv:
            qbh = NormalHeadingPart(headingtxt, stampurl, state, typ)
            prevtyp = flatb[-1].typ if flatb else None

            # ram together minor headings into previous ones which have no speeches
            if qbh.typ == 'minor-heading' and prevtyp == 'minor-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # ram together major headings into previous ones which have no speeches
            elif qbh.typ == 'major-heading' and prevtyp == 'major-heading':
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            elif (qbh.typ == 'minor-heading' and prevtyp == 'major-heading'
                    and (re.search('Allotted Day(?i)', qbh.stext[-1])
                         or re.search('^Petition$(?i)', flatb[-1].stext[-1]))):
                flatb[-1].stext.append(" — ")
                flatb[-1].stext.extend(qbh.stext)

            # suspension notices become unspoken text, but only after a speech;
            # otherwise the heading is not emitted at all
            elif re.search("(?:sitting suspended(?: for| until| till|\.))|(on resuming&)(?i)", qbh.stext[0]):
                if prevtyp == 'speech':
                    flatb.append(unspoken_speech(qbh.stext[0]))

            elif re.match("\[.*? in\s*the\s*Chair\.?\]$(?i)", qbh.stext[0]) and prevtyp == 'speech':
                flatb.append(unspoken_speech(qbh.stext[0]))

            # this is where we suck in a trailing "Clause" part of the title
            # that is mistakenly outside the heading.
            elif qbh.typ in ('minor-heading', 'major-heading') and prevtyp == 'speech':
                lastpara = flatb[-1].stext[-1]
                mmm = re.match('\s*<p>\s*((?:New )?(?:clause|schedule) \d+\w?)</p>(?i)', lastpara)
                if mmm:
                    if IsNotQuiet():
                        print("%s %s" % ("Clause/schedule moving", lastpara))
                    qbh.stext.insert(0, " — ")
                    qbh.stext.insert(0, mmm.group(1))
                    flatb[-1].stext = flatb[-1].stext[:-1]  # delete final value

                    # remove an empty speech
                    if not flatb[-1].stext:
                        if IsNotQuiet():
                            print("removing empty speech after moving 'clause/schedule' out")
                        assert flatb[-1].speaker == 'nospeaker="true"'
                        del flatb[-1]

                # double check: a looser search must not find what the strict match missed
                elif re.search('<p>\s*((?:New )?\s*(?:clause|schedule)\s*\w+)\s*</p>(?i)', lastpara):
                    print(lastpara)
                    assert False

                flatb.append(qbh)

            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division case
        else:
            (unspoketxt, qbd) = DivisionParsingPart(int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            if not flatb:
                raise ContextException("division heading with nothing before it",
                                       stamp=stampurl, fragment=headingtxt)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            qbdp = GrabDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

            # write out our file with the report of all divisions
            PreviewDivisionTextGuess(flatb)

        # output unaccounted for unspoken text occurring after a
        # division, or after a heading
        if not re.match('(?:<[^>]*>|\s)*$', unspoketxt):
            flatb.append(unspoken_speech(unspoketxt))

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb, bDebateBegToMove=True)

            # captures tail off westminster hall speeches
            qbdp = GrabWestminDivisionInterruptProced(qb, ss[1])
            flatb.append(qb)
            if qbdp:
                flatb.append(qbdp)

    # everything is now flattened out into one series of blocks
    return flatb
def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Classify a heading and wrap it in a no-speaker heading block.

    headingtxt -- heading text; <i> and <sup> tags are stripped before use.
    stampurl   -- current stamp; its sdate and aname are read, and it is
                  attached to the block and to any ContextException.
    state      -- dict shared between calls.  For dates after 2006-05-07 a
                  "Point(s) of Order" heading sets 'just_had_points_of_order';
                  the next heading that falls through to that test is made
                  major and the key is removed.  A "Remaining Private
                  Members ... Bills" heading sets 'remaining_private_bills',
                  which is never cleared here.
    typ        -- 'westminhall' forces every heading to 'minor-heading'.

    Returns a qspeech whose typ is 'inserted-heading', 'oral-heading',
    'major-heading' or 'minor-heading' and whose stext is a one-element
    list holding the entity-fixed heading.

    Raises ContextException for a loose "oral ... ques" or "in the chair"
    match that is not one of the precise forms, and when the fixed heading
    still contains '<' or '>'.  The tag test is an explicit check rather
    than an assert so that it also runs under python -O.

    This is an attempt at major heading detection.  The main wrap code
    spots adjournment debates and does its best with some procedural
    things, but it's pretty flawed.  The Oral questions heading is a
    super-major heading, so doesn't fit into the scheme.

    NOTE(review): NormalHeadingPart is defined again later in this file and
    that later definition is the one bound at import time.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("</?(?:i|sup)>(?i)", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable
    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('-- lost heading --(?i)', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("Oral Answers to Questions(?i)", headingtxt):
        boralheading = True

    # Check if there are any other spellings of "Oral Answers to Questions" with a loose match
    elif (re.search('oral(?i)', headingtxt) and re.search('ques(?i)', headingtxt)
            and (not re.search(" Not ", headingtxt))
            and (not re.search("electoral", headingtxt))
            and stampurl.sdate not in ("2002-06-11", "2012-02-09")):  # have a genuine title with Oral in it
        print(headingtxt)
        raise ContextException('Oral question match not precise enough', stamp=stampurl, fragment=headingtxt)

    # All upper case headings - UGH
    elif (not re.search('[a-z]', headingtxt)
            and not re.match('[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
            and not ('remaining_private_bills' in state and re.search(' Bill$(?i)', headingtxt))):
        bmajorheading = True

    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match("\[.*? in\s*the\s*Chair\.?\]$(?i)", headingtxt):
        bmajorheading = False

    elif re.search("in\s*the\s*chair(?i)", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough', stamp=stampurl, fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # From this date on, certain headings are major whatever the tests above said
    if stampurl.sdate > '2006-05-07':
        if re.match("(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)(?i)", headingtxt):
            bmajorheading = True
        if re.match("Points? of Order(?i)", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("Remaining Private Members[^ ]* Bills(?i)", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    if re.search("[<>]", headingtxtfx):
        # mirrors an assertion in gidmatching
        raise ContextException('Tag found in heading text', stamp=stampurl, fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb
def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Decide what kind of heading this is and return it as a heading block.

    headingtxt -- the heading; <i> and <sup> tags are removed first.
    stampurl   -- current stamp; sdate and aname are consulted, and the
                  stamp is attached to the block and to raised errors.
    state      -- dict carried from one call to the next.  After
                  2006-05-07, "Point(s) of Order" stores
                  'just_had_points_of_order' (removed again by the next
                  heading that reaches that test, which becomes major) and
                  "Remaining Private Members ... Bills" stores
                  'remaining_private_bills' (never removed here).
    typ        -- with 'westminhall' the result is always 'minor-heading'.

    Returns a qspeech with typ set to 'inserted-heading', 'oral-heading',
    'major-heading' or 'minor-heading', and stext set to a list containing
    only the entity-fixed heading.

    Raises ContextException when a heading merely resembles "Oral Answers
    to Questions" or an "[... in the Chair]" notice, and when a '<' or '>'
    survives FixHTMLEntities.  That last test used to be an assert caught
    and converted on the spot; it is now a plain condition so it cannot be
    compiled away by python -O.

    This is an attempt at major heading detection.  The main wrap code
    spots adjournment debates and does its best with some procedural
    things, but it's pretty flawed.  The Oral questions heading is a
    super-major heading, so doesn't fit into the scheme.

    NOTE(review): an earlier definition of NormalHeadingPart precedes this
    one in the file and is replaced by it at import time; the only
    difference visible in the code is the extra '"ordayhd_' anchor here.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("</?(?:i|sup)>(?i)", "", headingtxt)

    # at most one of these ends up set by the chain below
    is_inserted = False
    is_oral = False
    is_major = False

    if re.search('-- lost heading --(?i)', headingtxt):
        is_inserted = True

    # Oral question are really a major heading
    elif re.match("Oral Answers to Questions(?i)", headingtxt):
        is_oral = True

    # Check if there are any other spellings of "Oral Answers to Questions" with a loose match
    elif (re.search('oral(?i)', headingtxt) and re.search('ques(?i)', headingtxt)
            and (not re.search(" Not ", headingtxt))
            and (not re.search("electoral", headingtxt))
            and stampurl.sdate not in ("2002-06-11", "2012-02-09")):  # have a genuine title with Oral in it
        print(headingtxt)
        raise ContextException('Oral question match not precise enough',
                               stamp=stampurl, fragment=headingtxt)

    # All upper case headings - UGH
    elif (not re.search('[a-z]', headingtxt)
            and not re.match('[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
            and not ('remaining_private_bills' in state
                     and re.search(' Bill$(?i)', headingtxt))):
        is_major = True

    elif 'just_had_points_of_order' in state:
        is_major = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match("\[.*? in\s*the\s*Chair\.?\]$(?i)", headingtxt):
        is_major = False

    elif re.search("in\s*the\s*chair(?i)", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough',
                               stamp=stampurl, fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        is_major = True

    # Later pages: these headings are major regardless of the chain above
    if stampurl.sdate > '2006-05-07':
        if re.match(
                "(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)(?i)",
                headingtxt):
            is_major = True
        if re.match("Points? of Order(?i)", headingtxt):
            is_major = True
            state['just_had_points_of_order'] = True
        if re.match("Remaining Private Members[^ ]* Bills(?i)", headingtxt):
            is_major = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    if re.search("[<>]", headingtxtfx):
        # corresponds to an assertion in gidmatching
        raise ContextException('Tag found in heading text',
                               stamp=stampurl, fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif is_inserted:
        qb.typ = 'inserted-heading'
    elif is_oral:
        qb.typ = 'oral-heading'
    elif is_major:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb
def GrabDivisionProced(qbp, qbd):
    """Move the procedural paragraphs before a division into their own block.

    qbp -- the block emitted immediately before the division heading.
    qbd -- the division block; part of the signature but not read here.

    Returns a new no-speaker 'speech' block made of the last paragraphs of
    qbp, which are removed from qbp.stext.  Returns None without splitting
    on the two hard-coded correction dates, and when qbp is itself a
    no-speaker block (its paragraphs are then passed through SubsPWtextset
    in place).

    Raises Exception if qbp is not a speech with at least one paragraph,
    and ContextException if nothing announcing the division is found.

    Look-backs to the second-from-last paragraph are guarded by a length
    check: with a one-paragraph speech the old unguarded qbp.stext[-2]
    raised IndexError before any of the intended handling could run.

    NOTE(review): an earlier definition of GrabDivisionProced precedes this
    one in the file and is replaced by it at import time; that earlier one
    also handles a four-paragraph "England" form which is absent here.
    """
    if qbp.typ != "speech" or len(qbp.stext) < 1:
        # this is that crazy correction one
        if qbp.sstampurl.sdate == "2003-12-18":
            return None
        print(qbp.stext)
        raise Exception("previous to division not speech")

    have_two = len(qbp.stext) >= 2

    # join up italic runs that were closed and immediately reopened
    qbp.stext[-1] = re.sub(" </i><i> ", " ", qbp.stext[-1])
    qbp.stext[-1] = re.sub("</i><i> ", " ", qbp.stext[-1])

    hdg = rehousediv.match(qbp.stext[-1])
    if not hdg:
        # two-paragraph form: "divided" text in [-2], the totals in [-1]
        hdg_a = have_two and rehousediv_a.match(qbp.stext[-2])
        hdg_b = rehousediv_b.match(qbp.stext[-1])
        if hdg_a and hdg_b:
            hdg = hdg_b
        elif hdg_b and have_two:
            # They are occasionally putting "The" "Committee"
            # "divided" in two or three separate paragraphs
            two_prev = re.sub("</p><p[^>]*>", "", "".join(qbp.stext[-3:-1]))
            three_prev = re.sub("</p><p[^>]*>", "", "".join(qbp.stext[-4:-1]))
            if rehousediv_a.match(three_prev):
                qbp.stext = qbp.stext[:-4] + [three_prev, qbp.stext[-1]]
                hdg = hdg_b
            elif rehousediv_a.match(two_prev):
                qbp.stext = qbp.stext[:-3] + [two_prev, qbp.stext[-1]]
                hdg = hdg_b

    if not hdg:
        hdg = redivshouldappear.match(qbp.stext[-1])
    if not hdg:
        # another correction one
        if qbp.sstampurl.sdate != "2003-09-16":
            raise ContextException("no house divided before division: %s" % qbp.stext[-1])
        return None

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    # (qbp.stext may have been shortened above, so measure it again)
    iskim = 1
    if not (len(qbp.stext) >= 2 and re.search("Serjeant at Arms", qbp.stext[-2])):
        while len(qbp.stext) >= iskim:
            if reqput.match(qbp.stext[-iskim]):
                break
            iskim += 1

        # haven't found a question put before we reach the front;
        # fall back to skimming just the final paragraph
        if len(qbp.stext) < iskim:
            iskim = 1

    # copy the skimmed paragraphs into a non-speaking block
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = "speech"
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # and trim them off the block they came from
    qbp.stext = qbp.stext[:-iskim]

    return qbdp
def FilterLordsSpeech(qb):
    """Split one Lords block into its spoken and unspoken parts.

    qb is first run through FilterDebateSpeech.  Its opening paragraph is
    then checked for the unspoken forms (a question being asked, a writ of
    summons, an introduction, an amendment being moved) and its later
    paragraphs for motion text reported by MatchPWmotionStuff.

    Returns a list of one to three blocks in document order: an optional
    block holding the paragraphs before the speech proper, qb itself
    (possibly shortened), and an optional no-speaker block holding the
    trailing paragraphs.  Every '<p' paragraph of the unspoken tail is
    given a pwmotiontext attribute.  The summons and introduction forms
    return a single block straight away.

    Raises ContextException when the "noble Lord said" line is not where
    it is expected, when a speaker without a colon does not open with a
    recognised form, or when the speech boundary runs past the end.
    """
    # the usual debate filtering (paragraph indents and tables) comes first
    FilterDebateSpeech(qb)

    has_speaker = not re.match('nospeaker="true"', qb.speaker)
    # the colon attr is blank or has a : depending on what followed the matched name
    colon_attr = re.search('colon="(:?)"', qb.speaker)

    # paragraph index at which the unspoken tail is (so far) assumed to begin
    cut = 0

    # no colon attribute at all, or an actual colon: must be making a speech,
    # unless the opening text is of a kind that should not be spoken
    if has_speaker and (not colon_attr or colon_attr.group(1)):
        if (not re.search("<p>(?:moved|asked|rose to move,) (?i)", qb.stext[0])
                or re.search("<p>moved formally(?i)", qb.stext[0])):
            cut = 1  # 0th paragraph is speech text

    out = []
    if has_speaker and cut == 0:
        opening = qb.stext[0]

        if re.match("<p>asked Her Majesty's Government|<p>asked the|<p>—Took the Oath", opening):
            cut = SearchForNobleLordSaid(qb, "asked")
            if cut == len(qb.stext):  # No Noble Lord said, the usual
                cut = 1
            if cut != 1:
                print("%s %s %s" % ("Noble Lord Said on ", cut, "paragraph"))
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, cut)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)", opening):
            cut = SearchForNobleLordSaid(qb, "asked")
            if cut not in [1, 2]:
                print("%s %s %s" % ("Noble Lord Said on ", cut, "paragraph"))
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, cut)

        # a writ of summons (single line): tag it and bail out
        elif re.match("<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$", opening):
            assert len(qb.stext) == 1
            # cludgy; already have the <p>-tag embedded in the string
            qb.stext[0] = re.sub('^<p>', '<p pwmotiontext="summons">', qb.stext[0])
            out.append(qb)
            return out

        # an introduction: reported through a fresh no-speaker block
        elif re.search("having been created.*?Was, in (his|her) robes, introduced", opening):
            assert len(qb.stext) == 1
            intro = qspeech('nospeaker="true"', "", qb.sstampurl)
            intro.typ = 'speech'
            intro.stext = qb.stext  # same list object as qb.stext
            intro.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">', intro.stext[0])
            out.append(intro)
            return out

        elif re.match("<p>—Took the Oath", opening):
            assert False

        # a moved amendment
        elif re.match("<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention", opening):
            # find where the speech begins, and strip out "The noble lord said:"
            cut = SearchForNobleLordSaid(qb, "moved")

            # everything up to this point is non-speech
            assert cut > 0
            preamble = qspeech(qb.speaker, "", qb.sstampurl)
            preamble.typ = 'speech'
            preamble.stext = qb.stext[:cut]
            out.append(preamble)
            if cut == len(qb.stext):
                return out

            # upgrade the spoken part
            qb.speaker = qb.speaker.replace('colon=""', 'colon=":"')
            del qb.stext[:cut]
            assert qb.stext
            # the spoken text must reach at least here (after the line, "The noble lord said:")
            cut = 1

        # error, no moved amendment found
        else:
            print(qb.stext)
            print("no moved amendment; is a colon missing after the name?")
            raise ContextException("missing moved amendment", stamp=qb.sstampurl)

    # the boundary must lie inside the paragraph list
    if cut > len(qb.stext):
        print("%s %s %s" % ("ispeechstartp1 problem; speeches running through", cut, len(qb.stext)))
        print(qb.stext)
        raise ContextException(
            "end of speech boundary unclear running through; need to separate paragraphs?",
            stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    motion = None
    while has_speaker and cut < len(qb.stext):
        motion = MatchPWmotionStuff(qb, cut)
        if motion:
            break
        cut += 1

    # nothing follows the speech: the block goes out whole
    if cut == len(qb.stext):
        assert not motion
        out.append(qb)
        return out

    # from here on there are further lines after the speech
    assert (not has_speaker) or motion

    # splice off the unspoken text running off from the amendment statements
    if cut != 0:
        tail = qspeech('nospeaker="true"', "", qb.sstampurl)
        tail.typ = 'speech'
        tail.stext = qb.stext[cut:]
        del qb.stext[cut:]
        out.append(qb)
        out.append(tail)
    else:
        out.append(qb)
        tail = qb

    # once pwmotion statements begin, every paragraph should be one of them
    for idx in range(len(tail.stext)):
        if not re.match('<p', tail.stext[idx]):
            continue
        motion = MatchKnownAsPWmotionStuff(tail, idx)
        if not motion:
            if IsNotQuiet():
                print("UNRECOGNIZED-MOTION-TEXT%s: %s" % (
                    " " if has_speaker else "(*)", tail.stext[idx]))
            motion = "unrecognized"
        tail.stext[idx] = re.sub(
            '^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % motion, tail.stext[idx])

    return out
def FilterWransSections(text, sdate, lords=False):
    """Flatten a day's written answers into headings, questions and replies.

    text  -- the page text; fix substitutions are applied before splitting.
    sdate -- the sitting date string; compared against '2006-05-07' to
             select the later heading rules, and passed to the helpers.
    lords -- passed through to FilterQuestion.

    Returns a flat list of qspeech blocks.  Major headings have typ
    'major-heading'; every question/reply group is preceded by a
    'minor-heading' block (marked inserted-heading="true" when it repeats
    the heading for a further question under the same heading); questions
    have typ 'ques' and replies typ 'reply'.

    Raises ContextException for unspoken text under a heading, for an
    unrecognised or broken major heading, for a reply with no question
    before it and for a question left without a reply; raises Exception
    when a known major heading has speeches under it.

    The question-detecting pattern is written as a double-quoted literal:
    in the single-quoted form the apostrophe inside it ended the string
    early.
    NOTE(review): the apostrophe alternatives in that pattern are kept
    exactly as found ("'", the right single quote, and "'" again); the
    duplicate suggests one of them may originally have been an HTML
    entity - confirm against upstream.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # ih is the index of the first entry still to be processed
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    def major_heading(txt):
        # no need to fix the text since it comes from the map
        qbH = qspeech('nospeaker="true"', txt, stampurl)
        qbH.typ = 'major-heading'
        qbH.stext = [txt]
        return qbH

    flatb = []
    justhadnewtitle = False  # For when they put another "Written Answers to Questions" and date
    for sht in headspeak[ih:]:
        # triplet of ( heading, unspokentext, [(speaker, text)] )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # only tags and whitespace may sit between a heading and its first speech
        if not re.match('(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans", stamp=stampurl, fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        majorheadings = parlPhrases.wransmajorheadings

        # an all-capitals heading with no speeches is a major heading
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in majorheadings:
                raise ContextException("unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)", fragment=headingtxt, stamp=stampurl)
            flatb.append(major_heading(majorheadings[headingtxt]))
            continue

        # later pages: major headings are no longer in capitals
        elif not speechestxt and sdate > '2006-05-07':
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            if headingtxt.upper() not in majorheadings:
                # the heading following a repeated title is skipped
                if justhadnewtitle:
                    justhadnewtitle = False
                    continue
                raise ContextException("unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)", fragment=headingtxt, stamp=stampurl)
            flatb.append(major_heading(majorheadings[headingtxt.upper()]))
            justhadnewtitle = False
            continue

        elif not speechestxt:
            raise ContextException('broken heading %s' % headingtxt, stamp=stampurl, fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in majorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = re.findall('\[(?:HL)?(\d+)R?\]', ss[1])

            is_question = (
                re.match("(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty('|’|\')s Government|the ))(?i)", qb.text)
                or re.search('<wrans-question>', qb.text))

            # question posed
            if is_question:
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions answered in a block.
                if bNextStartofQ:
                    # the heading is made from the same stampurl as the first question
                    qbh = qspeech(headingmark, headingtxtfx, qb.sstampurl)
                    qbh.typ = 'minor-heading'
                    qbh.stext = [headingtxtfx]
                    flatb.append(qbh)
                    bNextStartofQ = False

                    # used to show that the subsequent headings in this block have been created,
                    # and weren't in the original text.
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                if not lqnums:
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)
                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException('start of question expected', stamp=qb.sstampurl, fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of the html with this tag
                if re.search("\<another-answer-to-follow\>", qb.text):
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # question numbers that reappear inside an answer are
                # deliberately not checked against those of the question
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here. Does what I want for now...
            raise ContextException("missing answer to question", stamp=stampurl, fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb