def new_italic_speech(self, ptext, phtml):
    """Append an italic paragraph to the current speech text.

    Before appending, the plain text is scanned for a chair announcement.
    If it contains "(Mr Speaker" or "(Madam Speaker", that title is passed
    to memberList.setDeputy(); otherwise, if it contains
    "(Mr Deputy Speaker [Name]" or "(Mr Principal Deputy Speaker [Name]",
    the bracketed name is passed instead.

    ptext -- plain text of the paragraph (searched for the announcement)
    phtml -- HTML of the paragraph (what actually gets appended)
    """
    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python passing unknown string escapes through unchanged.
    match = re.search(r'\(((?:Mr|Madam) Speaker)', ptext)
    if not match:
        match = re.search(r'\(Mr (?:Principal )?Deputy Speaker \[(.*?)\]', ptext)
    if match:
        memberList.setDeputy(match.group(1))
    self.text += '<p class="italic">%s</p>\n' % phtml
def parse_day(self, fp, text, date):
    """Parse one day's report and write it to fp as a <publicwhip> document.

    fp   -- output stream; it is wrapped with streamWriter before writing
    text -- markup of the day's report, parsed with NISoup
    date -- date string; compared lexically against 'YYYY-MM-DD' cut-offs,
            and its first four characters are read as the year

    Sets self.date, self.idA, self.idB and self.out, resets the per-day
    state held by memberList, then hands off to one of the format-specific
    parsers chosen by date.
    """
    self.date = date
    # Special case for 2002-10-08: a date ending in 'i' starts its ids
    # at 9/17 instead of 0/0.
    if re.search('i$', date):
        self.idA = 9
        self.idB = 17
    else:
        self.idA = 0
        self.idB = 0

    soup = NISoup(text, markupMassage=NISoup.myMassage)
    self.out = streamWriter(fp)
    self.out.write('<?xml version="1.0" encoding="utf-8"?>\n')
    # Entity values are given as numeric character references so this
    # source stays pure ASCII and does not depend on the file's encoding;
    # an XML parser expands them to the named characters.
    self.out.write('''
<!DOCTYPE publicwhip
[
<!ENTITY pound "&#163;">
<!ENTITY euro "&#8364;">
<!ENTITY agrave "&#224;">
<!ENTITY aacute "&#225;">
<!ENTITY acirc "&#226;">
<!ENTITY ccedil "&#231;">
<!ENTITY egrave "&#232;">
<!ENTITY eacute "&#233;">
<!ENTITY ecirc "&#234;">
<!ENTITY iacute "&#237;">
<!ENTITY ograve "&#242;">
<!ENTITY oacute "&#243;">
<!ENTITY uacute "&#250;">
<!ENTITY Aacute "&#193;">
<!ENTITY Eacute "&#201;">
<!ENTITY Iacute "&#205;">
<!ENTITY Oacute "&#211;">
<!ENTITY Uacute "&#218;">
<!ENTITY Uuml "&#220;">
<!ENTITY auml "&#228;">
<!ENTITY euml "&#235;">
<!ENTITY iuml "&#239;">
<!ENTITY ntilde "&#241;">
<!ENTITY ouml "&#246;">
<!ENTITY uuml "&#252;">
<!ENTITY fnof "&#402;">
<!ENTITY nbsp "&#160;">
<!ENTITY shy "&#173;">
<!ENTITY deg "&#176;">
<!ENTITY sup2 "&#178;">
<!ENTITY middot "&#183;">
<!ENTITY ordm "&#186;">
<!ENTITY frac14 "&#188;">
<!ENTITY frac12 "&#189;">
<!ENTITY frac34 "&#190;">
<!ENTITY ndash "&#8211;">
<!ENTITY mdash "&#8212;">
<!ENTITY lsquo "&#8216;">
<!ENTITY rsquo "&#8217;">
<!ENTITY ldquo "&#8220;">
<!ENTITY rdquo "&#8221;">
<!ENTITY hellip "&#8230;">
<!ENTITY bull "&#8226;">
]>

<publicwhip>
''')

    # Don't want to keep it between days, or reruns of same day
    memberList.cleardebatehistory()
    memberList.setDeputy(None)

    # From 2014-09-07 the newest parser is always used; between 2012-04-30
    # and then it is used only when no <p> carries a class attribute.
    if date >= '2014-09-07' or (
            date >= '2012-04-30' and not soup('p', {'class': True})):
        self.parse_day_new_new(soup, date)
    elif int(date[0:4]) >= 2006:
        self.parse_day_new(soup, date)
    else:
        self.parse_day_old(soup('p'))

    self.out.write('</publicwhip>\n')
def parse_day_old(self, body):
    """Parse the oldest report layout from a list of <p> elements.

    body -- sequence of paragraph tags (parse_day passes soup('p')).  The
            first must start with "Northern Ireland Assembly" and the third
            with "Contents"; everything from the fourth onwards is content.

    Reads self.date (must look like YYYY-MM-DD with an optional trailing
    'i') to build self.baseurl / self.url.  While walking the paragraphs it
    updates self.speaker, self.text, self.idA, self.idB and self.url, writes
    <oral-heading> markers to self.out, and calls self.display_heading() /
    self.display_speech() to emit output.

    Raises Exception if either expected heading is missing, and
    ContextException for a numbered anchor or a "Mr Name:" paragraph that
    none of the earlier rules recognised.
    """
    match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
    urldate = '%s%s%s%s' % match.groups()
    self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
    self.url = self.baseurl

    # Heading check.  body[1] is skipped unread; only the first and third
    # paragraphs are validated.
    if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
        raise Exception('Missing NIA heading!')
    if not re.match('Contents', body[2].find(text=True)):
        raise Exception('Missing contents heading!')
    body = body[3:]

    timestamp = ''
    in_oral_answers = False
    oral_qn = 0  # pending question number, prefixed to the next paragraph
    self.speaker = (None, timestamp)
    self.text = ''

    for p in body:
        if not p(text=True):
            continue
        ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
        phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

        # Skip paragraphs whose first link points at an in-page anchor or
        # at a target starting with a digit, and whitespace-only ones.
        # startswith() copes with an empty href, which indexing [0] did not.
        href = p.a.get('href', '') if p.a else ''
        if href.startswith('#') or re.match(r'\d', href) or ptext == ' ':
            continue

        if p.findParent('i'):
            # Italic paragraph: a time of day, a chair change, the opening
            # line, or a plain procedural note.
            match = re.match(r'(\d\d?)\.(\d\d) (a|p)m', ptext)
            if match:
                hour = int(match.group(1))
                if hour < 12 and match.group(3) == 'p':
                    hour += 12
                timestamp = "%s:%s" % (hour, match.group(2))
                continue
            match = re.search(
                r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))',
                phtml)
            if match:
                memberList.setDeputy(match.group(1))
            match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
            if match:
                if match.group(1) == 'noon':
                    timestamp = '12:00'
                else:
                    # NOTE(review): this keeps the page's 'hh.mm' form,
                    # unlike the 'h:mm' built above -- confirm consumers
                    # accept both.
                    timestamp = match.group(1)
                self.speaker = (self.speaker[0], timestamp)
            self.text += '<p class="italic">%s</p>\n' % phtml
            continue

        if p.findParent('font', size=1):
            self.text += '<p class="small">%s</p>\n' % phtml
            continue

        centred_bold = p.get('align', '') == 'center' and (p.b or p.parent.name == 'b')
        bold_stage = p.parent.name == 'b' and re.search('Stage$', ptext)
        if centred_bold or bold_stage:
            # Heading: flush the speech in progress first.
            self.display_speech()
            self.speaker = (None, timestamp)
            aname = p.a and p.a.get('name', '')
            # A purely numeric anchor name (optionally '#'-prefixed) starts
            # a new major section and resets the minor counter.
            numbered = aname and re.match(r'#?(\d+)$', aname)
            if ptext == 'Oral Answers':
                self.out.write('<oral-heading>\n')
                in_oral_answers = True
                if numbered:
                    self.idA = int(numbered.group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
            elif numbered:
                if in_oral_answers:
                    self.out.write('</oral-heading>\n')
                    in_oral_answers = False
                self.idA = int(numbered.group(1))
                self.idB = 0
                self.url = '%s#%s' % (self.baseurl, aname)
                self.display_heading(ptext, timestamp, 'major')
            elif aname:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'major')
            else:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'minor')
            continue

        if p.b or p.parent.name == 'b':
            # Bold text that is not a heading names the next speaker.
            new_speaker = p.b.find(text=True) if p.b else ptext
            if not re.match(r'\s*$', new_speaker):
                self.display_speech()
                speaker = re.sub(r"\s+", " ", new_speaker).strip()
                speaker = speaker.replace(':', '')
                _member_id, member = memberList.match(speaker, self.date)
                self.speaker = (member, timestamp)
            if p.b and p.b.nextSibling:
                # Anything after the bold name in the same paragraph is the
                # start of the speech: drop the name, keep the rest.
                p.b.extract()
                phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
                self.text += "<p>%s</p>\n" % phtml
            continue

        # A paragraph holding only "N." is a question number; remember it
        # and glue it onto the paragraph that follows.
        match = re.match(r'(\d+)\.$', phtml)
        if match:
            oral_qn = match.group(1)
            continue

        if p.a and re.match(r'#\d+$', p.a.get('name', '')):
            raise ContextException('Uncaught title!')
        if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
            raise ContextException('Uncaught speaker! ' + phtml)

        if oral_qn:
            phtml = "%s. %s" % (oral_qn, phtml)
            oral_qn = 0
        self.text += "<p>%s</p>\n" % phtml

    self.display_speech()
    if in_oral_answers:
        self.out.write('</oral-heading>\n')