def new_person_speak(self, p, timestamp):
    """Start a new speech from paragraph ``p``, whose <strong> names the speaker.

    The first text node of ``p.strong`` is cleaned up (whitespace collapsed,
    colons dropped), looked up through ``memberList.match`` for ``self.date``
    and stored, together with ``timestamp``, in ``self.speaker``.  The
    <strong> element is then removed from ``p`` and what is left of the
    paragraph is appended to ``self.text`` as a single ``<p>``.
    """
    speaker = p.strong.find(text=True)
    # NOTE(review): as written this strips every occurrence of the quoted
    # character before whitespace is collapsed; presumably the literal was a
    # non-breaking space / '&nbsp;' entity that was decoded in transit.
    # Confirm against the repository copy before relying on it.
    speaker = re.sub(' ', '', speaker)
    speaker = re.sub(r"\s+", " ", speaker).strip()
    speaker = speaker.replace(':', '')

    # Only the second element of the match result is kept.
    _, matched = memberList.match(speaker, self.date)
    self.speaker = (matched, timestamp)

    # Drop the speaker's name from the paragraph, then any colon left behind.
    p.strong.extract()
    phtml = re.sub(r'^:\s*', '', p.renderContents())
    # presumably renderContents() returns a UTF-8 byte string here (the result
    # is decoded, not encoded) -- TODO confirm for the parser version in use.
    phtml = re.sub(r"\s+", " ", phtml).decode('utf-8')
    self.text += "<p>%s</p>\n" % phtml
def parse_day(self, input):
    """Parse one day's JSON Hansard feed, flushing speeches as they complete.

    ``input`` is a JSON document: either a list of component dicts, or an
    object wrapping that list under
    ``AllHansardComponentsList -> HansardComponent``.  Every component is
    dispatched on its ``ComponentType``.  Speech HTML accumulates in
    ``self.text``; ``self.display_speech()`` is called whenever a new heading
    or speaker begins, and once more after the last component.  Components
    whose text is empty or null are reported on stdout and skipped.

    Raises:
        AssertionError: the document title does not match ``self.date``.
        Exception: a header carries an unknown ``ComponentHeaderId``.
        ContextException: a question does not start with "<name> asked", or
            the component type is not recognised.
    """
    self.heading = {}
    self.pre_heading = {}
    self.speaker = {}
    self.text = ''
    timestamp = ''

    speaker_type_re = re.compile(
        r'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker'
        r'|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker'
        r'|TemporarySpeaker|Speaker)\)$')

    components = json.loads(input)
    if 'AllHansardComponentsList' in components:
        components = components['AllHansardComponentsList']['HansardComponent']

    for line in components:
        # NOTE(review): as written this replace() swaps '&' for itself;
        # presumably the replacement was an HTML entity that was decoded in
        # transit -- confirm against the repository copy.
        text = (line['ComponentText'] or '').replace('&', '&')
        if not text:
            print("WARNING: Empty line: %s" % line)
            continue

        ctype = line['ComponentType']

        if ctype == 'Document Title':
            # self.date is sliced as YYYY-MM-DD; the title carries DD/MM/YYYY.
            title_re = r'(Plenary|PLE), %s/%s/%s$' % (
                self.date[8:10], self.date[5:7], self.date[0:4])
            # Raised explicitly so the check survives ``python -O``.
            if not re.match(title_re, text):
                raise AssertionError(
                    'Unexpected document title %r for %s' % (text, self.date))

        elif ctype == 'Time':
            timestamp = self.time_period(text)

        elif ctype == 'Header':
            level = line['ComponentHeaderId']
            if level in (0, 1, '0', '1'):
                typ = 'major'
            elif level in (2, '2'):
                typ = 'minor'
            else:
                raise Exception("Unknown ComponentHeaderId %s" % level)

            if self.heading and self.heading['type'] == typ:
                # Consecutive headings of the same type are merged into one.
                self.pre_heading = {'level': level, 'text': self.heading['text']}
                self.heading['text'] += ' — %s' % text
            else:
                self.display_speech()
                self.speaker = {'ts': timestamp}
                if self.pre_heading:
                    # Levels arrive as ints or digit strings; compare them
                    # numerically so 1 and '1' are treated as the same level.
                    previous = int(self.pre_heading['level'])
                    current = int(level)
                    if previous == current:
                        text = '%s — %s' % (self.pre_heading['text'], text)
                    elif previous > current:
                        self.pre_heading = {}
                self.heading = {'text': text, 'ts': timestamp, 'type': typ}

        elif speaker_type_re.match(ctype):
            # RelatedItemId here is the NI speaker ID. We could use that!
            # But for now, carry on going by name as all that code exists.
            self.display_speech()
            speaker = text.replace(':', '')
            _, matched = memberList.match(speaker, self.date)
            self.speaker = {'id': matched, 'ts': timestamp}

        elif ctype in ('Speaker (Special)', 'Speaker (GuestSpeaker)'):
            # These speakers are stored by name, without a member lookup.
            self.display_speech()
            self.speaker = {'name': text.replace(':', ''), 'ts': timestamp}

        elif ctype == 'Question':
            self.display_speech()
            m = re.match(r'(T?[0-9]+\. )?(.*?) asked', text)
            if not m:
                raise ContextException("Question without an asker: %s" % text)
            _, matched = memberList.match(m.group(2), self.date)
            self.speaker = {'id': matched, 'ts': timestamp}
            self.text += "<p>%s</p>\n" % text

        elif ctype == 'Quote':
            self.text += '<p class="indent">%s</p>\n' % text

        elif ctype in ('Plenary Item Text', 'Procedure Line'):
            if re.match(r'The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)', text):
                timestamp = self.time_period(text)
                self.speaker['ts'] = timestamp
            self.text += '<p class="italic">%s</p>\n' % text

        elif ctype == 'Bill Text':
            self.text += text.replace('<p>', '<p class="indent">')  # Already is HTML

        elif ctype in ('Division', 'Spoken Text'):
            # The inline flag must lead the pattern: a trailing "(?i)" is
            # rejected by the re module from Python 3.11 on.
            text = re.sub(r'(?i)\s*<BR />\s*<BR />\s*', '</p>\n<p>', text)
            text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
            self.text += '<p>%s</p>\n' % text

        else:
            raise ContextException("Uncaught Component Type! %s" % ctype)

    self.display_speech()
def parse_day(self, soup):
    """Parse one day's HTML report into headings and speeches.

    ``soup`` is called as ``soup('p')`` to obtain the paragraphs, which are
    then used through a BeautifulSoup-style tag API (``find``,
    ``findParent``, ``renderContents``, ``extract``).  The first three
    paragraphs must be the Assembly heading, a date line and a "Contents"
    heading; the rest are classified as contents links (skipped), italic
    procedural text, small print, headings, speaker changes or plain speech.
    Speech HTML accumulates in ``self.text`` and is flushed through
    ``self.display_speech()``; headings go through ``self.display_heading()``
    and oral-answer sections are bracketed by ``<oral-heading>`` markers
    written to ``self.out``.

    Raises:
        Exception: ``self.date`` has an unexpected format, or one of the
            expected leading headings is missing.
        ContextException: a paragraph looks like a title or a speaker that
            none of the rules above caught.
    """
    body = soup('p')

    match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
    if not match:
        raise Exception('Unexpected date format: %s' % self.date)
    urldate = '%s%s%s%s' % match.groups()
    self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
    self.url = self.baseurl

    # Heading check (body[1], the date line, is not inspected).
    if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
        raise Exception('Missing NIA heading!')
    if not re.match('Contents', body[2].find(text=True)):
        raise Exception('Missing contents heading!')
    body = body[3:]

    timestamp = ''
    in_oral_answers = False
    oral_qn = 0
    self.speaker = (None, timestamp)
    self.text = ''

    for p in body:
        if not p(text=True):
            continue
        ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
        phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

        # Skip contents-style links (href starting with '#' or a digit).
        # startswith() also copes with an empty href, which indexing [0]
        # did not.
        # NOTE(review): the literal compared with ptext below is reproduced
        # as found; presumably it was a non-breaking space / '&nbsp;' that
        # was decoded in transit -- confirm.
        href = p.a.get('href', '') if p.a else ''
        if href.startswith('#') or re.match(r'\d', href) or ptext == ' ':
            continue

        if p.findParent('i'):
            # Italic paragraphs are either a bare time or procedural text.
            ts = self.time_period(ptext, optional=True)
            if ts:
                timestamp = ts
                continue
            match = re.search(
                r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))',
                phtml)
            if match:
                memberList.setDeputy(match.group(1))
            match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
            if match:
                if match.group(1) == 'noon':
                    timestamp = '12:00'
                else:
                    timestamp = match.group(1)
                self.speaker = (self.speaker[0], timestamp)
            self.text += '<p class="italic">%s</p>\n' % phtml
            continue

        if p.findParent('font', size=1):
            self.text += '<p class="small">%s</p>\n' % phtml
            continue

        in_bold = p.b or p.parent.name == 'b'
        is_heading = (
            (p.get('align', '') == 'center' and in_bold)
            or (p.parent.name == 'b' and re.search(r'Stage$', ptext)))

        if is_heading:
            self.display_speech()
            self.speaker = (None, timestamp)
            aname = p.a and p.a.get('name', '')
            # A purely numeric anchor (optionally prefixed with '#') starts
            # a new top-level section.
            numbered = aname and re.match(r'#?(\d+)$', aname)
            if ptext == 'Oral Answers':
                # No heading is emitted here, only the opening marker.
                self.out.write('<oral-heading>\n')
                in_oral_answers = True
                if numbered:
                    self.idA = int(numbered.group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
            elif numbered:
                if in_oral_answers:
                    self.out.write('</oral-heading>\n')
                    in_oral_answers = False
                self.idA = int(numbered.group(1))
                self.idB = 0
                self.url = '%s#%s' % (self.baseurl, aname)
                self.display_heading(ptext, timestamp, 'major')
            elif aname:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'major')
            else:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'minor')
            continue

        if in_bold:
            # Bold text that is not a heading names the next speaker.
            new_speaker = p.b.find(text=True) if p.b else ptext
            if not re.match(r'\s*$', new_speaker):
                self.display_speech()
                speaker = re.sub(r"\s+", " ", new_speaker).strip()
                speaker = speaker.replace(':', '')
                _, matched = memberList.match(speaker, self.date)
                self.speaker = (matched, timestamp)
            if p.b and p.b.nextSibling:
                # Keep whatever follows the bold name as the first paragraph.
                p.b.extract()
                phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
                self.text += "<p>%s</p>\n" % phtml
            continue

        # A paragraph holding just "N." is remembered and prefixed to the
        # next plain paragraph.
        match = re.match(r'(\d+)\.$', phtml)
        if match:
            oral_qn = match.group(1)
            continue

        if p.a and re.match(r'#\d+$', p.a.get('name', '')):
            raise ContextException('Uncaught title!')
        if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
            raise ContextException('Uncaught speaker! ' + phtml)

        if oral_qn:
            phtml = "%s. %s" % (oral_qn, phtml)
            oral_qn = 0
        self.text += "<p>%s</p>\n" % phtml

    self.display_speech()
    if in_oral_answers:
        self.out.write('</oral-heading>\n')
def parse_day(self, input):
    """Parse one day's JSON Hansard feed, flushing speeches as they complete.

    ``input`` is a JSON list of component dicts.  Every component is
    dispatched on its ``ComponentType``.  Speech HTML accumulates in
    ``self.text``; ``self.display_speech()`` is called whenever a new heading
    or speaker begins, and once more after the last component.  Components
    whose text is empty or null are reported on stdout and skipped.

    Raises:
        AssertionError: the document title does not match ``self.date``.
        Exception: a header carries an unknown ``ComponentHeaderId``.
        ContextException: a question does not start with "<name> asked", or
            the component type is not recognised.
    """
    self.heading = {}
    self.pre_heading = {}
    self.speaker = {}
    self.text = ''
    timestamp = ''

    speaker_type_re = re.compile(
        r'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker'
        r'|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker'
        r'|TemporarySpeaker|Speaker)\)$')

    for line in json.loads(input):
        # A null ComponentText is treated like an empty one instead of
        # failing on None.replace().
        # NOTE(review): as written this replace() swaps '&' for itself;
        # presumably the replacement was an HTML entity that was decoded in
        # transit -- confirm against the repository copy.
        text = (line['ComponentText'] or '').replace('&', '&')
        if not text:
            print("WARNING: Empty line: %s" % line)
            continue

        ctype = line['ComponentType']

        if ctype == 'Document Title':
            # self.date is sliced as YYYY-MM-DD; the title carries DD/MM/YYYY.
            expected = 'Plenary, %s/%s/%s' % (
                self.date[8:10], self.date[5:7], self.date[0:4])
            # Raised explicitly so the check survives ``python -O``.
            if text != expected:
                raise AssertionError(
                    'Unexpected document title %r, wanted %r' % (text, expected))

        elif ctype == 'Time':
            timestamp = self.time_period(text)

        elif ctype == 'Header':
            level = line['ComponentHeaderId']
            if level in (0, 1):
                typ = 'major'
            elif level == 2:
                typ = 'minor'
            else:
                raise Exception("Unknown ComponentHeaderId %s" % level)

            if self.heading and self.heading['type'] == typ:
                # Consecutive headings of the same type are merged into one.
                self.pre_heading = {'level': level, 'text': self.heading['text']}
                self.heading['text'] += ' — %s' % text
            else:
                self.display_speech()
                self.speaker = {'ts': timestamp}
                if self.pre_heading and self.pre_heading['level'] == level:
                    text = '%s — %s' % (self.pre_heading['text'], text)
                elif self.pre_heading and self.pre_heading['level'] > level:
                    self.pre_heading = {}
                self.heading = {'text': text, 'ts': timestamp, 'type': typ}

        elif speaker_type_re.match(ctype):
            # RelatedItemId here is the NI speaker ID. We could use that!
            # But for now, carry on going by name as all that code exists.
            self.display_speech()
            speaker = text.replace(':', '')
            _, matched = memberList.match(speaker, self.date)
            self.speaker = {'id': matched, 'ts': timestamp}

        elif ctype in ('Speaker (Special)', 'Speaker (GuestSpeaker)'):
            # These speakers are stored by name, without a member lookup.
            self.display_speech()
            self.speaker = {'name': text.replace(':', ''), 'ts': timestamp}

        elif ctype == 'Question':
            self.display_speech()
            m = re.match(r'(T?[0-9]+\. )?(.*?) asked', text)
            if not m:
                raise ContextException("Question without an asker: %s" % text)
            _, matched = memberList.match(m.group(2), self.date)
            self.speaker = {'id': matched, 'ts': timestamp}
            self.text += "<p>%s</p>\n" % text

        elif ctype == 'Quote':
            self.text += '<p class="indent">%s</p>\n' % text

        elif ctype in ('Plenary Item Text', 'Procedure Line'):
            if re.match(r'The Assembly met at ((\d\d?):(\d\d) (am|pm)|12 noon)', text):
                timestamp = self.time_period(text)
                self.speaker['ts'] = timestamp
            self.text += '<p class="italic">%s</p>\n' % text

        elif ctype == 'Bill Text':
            self.text += text.replace('<p>', '<p class="indent">')  # Already is HTML

        elif ctype in ('Division', 'Spoken Text'):
            # The inline flag must lead the pattern: a trailing "(?i)" is
            # rejected by the re module from Python 3.11 on.
            text = re.sub(r'(?i)\s*<BR />\s*<BR />\s*', '</p>\n<p>', text)
            text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
            self.text += '<p>%s</p>\n' % text

        else:
            raise ContextException("Uncaught Component Type! %s" % ctype)

    self.display_speech()
def parse_day(self, soup):
    """Parse one day's HTML report into headings and speeches.

    ``soup`` is called as ``soup('p')`` to obtain the paragraphs, which are
    then used through a BeautifulSoup-style tag API (``find``,
    ``findParent``, ``renderContents``, ``extract``).  The first three
    paragraphs must be the Assembly heading, a date line and a "Contents"
    heading; the rest are classified as contents links (skipped), italic
    procedural text, small print, headings, speaker changes or plain speech.
    Speech HTML accumulates in ``self.text`` and is flushed through
    ``self.display_speech()``; headings go through ``self.display_heading()``
    and oral-answer sections are bracketed by ``<oral-heading>`` markers
    written to ``self.out``.

    Raises:
        Exception: ``self.date`` has an unexpected format, or one of the
            expected leading headings is missing.
        ContextException: a paragraph looks like a title or a speaker that
            none of the rules above caught.
    """
    body = soup('p')

    match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
    if not match:
        raise Exception('Unexpected date format: %s' % self.date)
    urldate = '%s%s%s%s' % match.groups()
    self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
    self.url = self.baseurl

    # Heading check (body[1], the date line, is not inspected).
    if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
        raise Exception('Missing NIA heading!')
    if not re.match('Contents', body[2].find(text=True)):
        raise Exception('Missing contents heading!')
    body = body[3:]

    timestamp = ''
    in_oral_answers = False
    oral_qn = 0
    self.speaker = (None, timestamp)
    self.text = ''

    for p in body:
        if not p(text=True):
            continue
        ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
        phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

        # Skip contents-style links (href starting with '#' or a digit).
        # startswith() also copes with an empty href, which indexing [0]
        # did not.
        # NOTE(review): the literal compared with ptext below is reproduced
        # as found; presumably it was a non-breaking space / '&nbsp;' that
        # was decoded in transit -- confirm.
        href = p.a.get('href', '') if p.a else ''
        if href.startswith('#') or re.match(r'\d', href) or ptext == ' ':
            continue

        if p.findParent('i'):
            # Italic paragraphs are either a bare time or procedural text.
            ts = self.time_period(ptext, optional=True)
            if ts:
                timestamp = ts
                continue
            match = re.search(
                r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))',
                phtml)
            if match:
                memberList.setDeputy(match.group(1))
            match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
            if match:
                if match.group(1) == 'noon':
                    timestamp = '12:00'
                else:
                    timestamp = match.group(1)
                self.speaker = (self.speaker[0], timestamp)
            self.text += '<p class="italic">%s</p>\n' % phtml
            continue

        if p.findParent('font', size=1):
            self.text += '<p class="small">%s</p>\n' % phtml
            continue

        in_bold = p.b or p.parent.name == 'b'
        is_heading = (
            (p.get('align', '') == 'center' and in_bold)
            or (p.parent.name == 'b' and re.search(r'Stage$', ptext)))

        if is_heading:
            self.display_speech()
            self.speaker = (None, timestamp)
            aname = p.a and p.a.get('name', '')
            # A purely numeric anchor (optionally prefixed with '#') starts
            # a new top-level section.
            numbered = aname and re.match(r'#?(\d+)$', aname)
            if ptext == 'Oral Answers':
                # No heading is emitted here, only the opening marker.
                self.out.write('<oral-heading>\n')
                in_oral_answers = True
                if numbered:
                    self.idA = int(numbered.group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
            elif numbered:
                if in_oral_answers:
                    self.out.write('</oral-heading>\n')
                    in_oral_answers = False
                self.idA = int(numbered.group(1))
                self.idB = 0
                self.url = '%s#%s' % (self.baseurl, aname)
                self.display_heading(ptext, timestamp, 'major')
            elif aname:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'major')
            else:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'minor')
            continue

        if in_bold:
            # Bold text that is not a heading names the next speaker.
            new_speaker = p.b.find(text=True) if p.b else ptext
            if not re.match(r'\s*$', new_speaker):
                self.display_speech()
                speaker = re.sub(r"\s+", " ", new_speaker).strip()
                speaker = speaker.replace(':', '')
                _, matched = memberList.match(speaker, self.date)
                self.speaker = (matched, timestamp)
            if p.b and p.b.nextSibling:
                # Keep whatever follows the bold name as the first paragraph.
                p.b.extract()
                phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
                self.text += "<p>%s</p>\n" % phtml
            continue

        # A paragraph holding just "N." is remembered and prefixed to the
        # next plain paragraph.
        match = re.match(r'(\d+)\.$', phtml)
        if match:
            oral_qn = match.group(1)
            continue

        if p.a and re.match(r'#\d+$', p.a.get('name', '')):
            raise ContextException('Uncaught title!')
        if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
            raise ContextException('Uncaught speaker! ' + phtml)

        if oral_qn:
            phtml = "%s. %s" % (oral_qn, phtml)
            oral_qn = 0
        self.text += "<p>%s</p>\n" % phtml

    self.display_speech()
    if in_oral_answers:
        self.out.write('</oral-heading>\n')