Example #1
0
 def new_person_speak(self, p, timestamp):
     """Record a new speaker from a paragraph that opens with a <strong> name.

     The first text node inside ``p.strong`` is cleaned up (non-breaking
     spaces removed, whitespace collapsed, colons dropped) and looked up
     with ``memberList.match`` against ``self.date``.  The second element of
     the lookup result is stored, together with ``timestamp``, as
     ``self.speaker``.  The <strong> element is then removed from ``p`` and
     the remaining markup is appended to ``self.text`` as one ``<p>``.

     Args:
         p: paragraph tag whose first <strong> child holds the speaker name
             (presumably a BeautifulSoup 3 Tag -- confirm against caller).
         timestamp: value stored alongside the speaker in ``self.speaker``.
     """
     speaker = p.strong.find(text=True)
     # Remove non-breaking spaces in both forms: the literal entity and the
     # decoded U+00A0 character.  The character is written as an escape so
     # that no invisible, easily-corrupted byte has to live in the source.
     speaker = re.sub(u'&nbsp;|\xa0', '', speaker)
     speaker = re.sub(r"\s+", " ", speaker).strip()
     speaker = speaker.replace(':', '')
     # Only the second element of the match result is used here.
     _, stri = memberList.match(speaker, self.date)
     self.speaker = (stri, timestamp)
     # Drop the name element so only the spoken text is rendered below.
     p.strong.extract()
     phtml = p.renderContents()
     # A colon that sat outside the <strong> tag is left at the start.
     phtml = re.sub(r'^:\s*', '', phtml)
     phtml = re.sub(r"\s+", " ", phtml).decode('utf-8')
     self.text += "<p>%s</p>\n" % phtml
Example #2
0
	def new_person_speak(self, p, timestamp):
		"""Switch the current speaker to the one named in ``p.strong``.

		The name is taken from the first text node of the paragraph's
		<strong> element, normalised, and passed to ``memberList.match``
		with ``self.date``.  ``self.speaker`` becomes a
		``(second match element, timestamp)`` tuple, and the paragraph's
		remaining markup (with the <strong> element removed) is appended to
		``self.text`` wrapped in ``<p>...</p>``.
		"""
		name = p.strong.find(text=True)
		# Same clean-up order as always: entity removal first, then
		# whitespace collapsing, trimming, and finally colon removal.
		for pattern, replacement in (('&nbsp;', ''), (r'\s+', ' ')):
			name = re.sub(pattern, replacement, name)
		name = re.sub(':', '', name.strip())

		_unused, member_ref = memberList.match(name, self.date)
		self.speaker = (member_ref, timestamp)

		# Remove the name element so only the speech itself is rendered.
		p.strong.extract()
		remainder = re.sub(r'^:\s*', '', p.renderContents())
		remainder = re.sub(r'\s+', ' ', remainder).decode('utf-8')
		self.text = self.text + "<p>%s</p>\n" % remainder
Example #3
0
 def parse_day(self, input):
     """Parse one day's plenary JSON and accumulate headings and speeches.

     ``input`` is a JSON document: either a list of component dicts, or an
     object holding that list under
     ``AllHansardComponentsList -> HansardComponent``.  Each component is
     dispatched on its ``ComponentType``.  Speech markup is accumulated in
     ``self.text``; ``self.display_speech()`` is called whenever a new
     heading, speaker or question starts, and once more after the last
     component.

     Raises:
         AssertionError: the Document Title does not match ``self.date``.
         Exception: a Header carries an unknown ``ComponentHeaderId``.
         ContextException: a Question has no "<name> asked" prefix, or the
             ComponentType is not handled.
     """
     self.heading = {}
     self.pre_heading = {}
     self.speaker = {}
     self.text = ''
     timestamp = ''
     j = json.loads(input)
     if 'AllHansardComponentsList' in j:
         j = j['AllHansardComponentsList']['HansardComponent']
     for line in j:
         # ComponentText may be null; bare ampersands are escaped.
         text = (line['ComponentText'] or '').replace('&', '&amp;')
         if not text:
             # Call form prints the same single string on Python 2 and 3.
             print("WARNING: Empty line: %s" % line)
             continue

         ctype = line['ComponentType']
         if ctype == 'Document Title':
             expected = '(Plenary|PLE), %s/%s/%s$' % (
                 self.date[8:10], self.date[5:7], self.date[0:4])
             # Explicit raise (same exception type as the former assert) so
             # the check is not stripped when running with -O.
             if not re.match(expected, text):
                 raise AssertionError(
                     "Unexpected document title %r for date %s" %
                     (text, self.date))
         elif ctype == 'Time':
             timestamp = self.time_period(text)
         elif ctype == 'Header':
             level = line['ComponentHeaderId']
             if level in (0, 1, '0', '1'):
                 typ = 'major'
             elif level in (2, '2'):
                 typ = 'minor'
             else:
                 raise Exception("Unknown ComponentHeaderId %s" % level)
             if self.heading and self.heading['type'] == typ:
                 # A second heading of the same type in a row is merged
                 # into the pending one; the earlier text is remembered so
                 # later headings at this level can be prefixed with it.
                 self.pre_heading = {
                     'level': level,
                     'text': self.heading['text'],
                 }
                 self.heading['text'] += ' &#8212; %s' % text
             else:
                 self.display_speech()
                 self.speaker = {'ts': timestamp}
                 if self.pre_heading and self.pre_heading['level'] == level:
                     text = '%s &#8212; %s' % (self.pre_heading['text'],
                                               text)
                 elif self.pre_heading and self.pre_heading['level'] > level:
                     self.pre_heading = {}
                 self.heading = {'text': text, 'ts': timestamp, 'type': typ}
         elif re.match(
                 r'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$',
                 ctype):
             # RelatedItemId here is the NI speaker ID. We could use that!
             # But for now, carry on going by name as all that code exists.
             self.display_speech()
             speaker = text.replace(':', '')
             _, stri = memberList.match(speaker, self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
         elif ctype in ('Speaker (Special)', 'Speaker (GuestSpeaker)'):
             # These speakers are recorded by name only, with no lookup.
             self.display_speech()
             speaker = text.replace(':', '')
             self.speaker = {'name': speaker, 'ts': timestamp}
         elif ctype == 'Question':
             self.display_speech()
             m = re.match(r'(T?[0-9]+\. )?(.*?) asked', text)
             if not m:
                 # Previously surfaced as an AttributeError on None.
                 raise ContextException(
                     "Question without '<name> asked' prefix: %s" % text)
             _, stri = memberList.match(m.group(2), self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
             self.text += "<p>%s</p>\n" % text
         elif ctype == 'Quote':
             self.text += '<p class="indent">%s</p>\n' % text
         elif ctype in ('Plenary Item Text', 'Procedure Line'):
             match = re.match(
                 r'The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)',
                 text)
             if match:
                 timestamp = self.time_period(text)
                 self.speaker['ts'] = timestamp
             self.text += '<p class="italic">%s</p>\n' % text
         elif ctype == 'Bill Text':
             self.text += text.replace(
                 '<p>', '<p class="indent">')  # Already is HTML
         elif ctype in ('Division', 'Spoken Text'):
             # The inline flag must lead the pattern: a trailing global
             # flag is rejected by newer versions of the re module.
             text = re.sub(r'(?i)\s*<BR />\s*<BR />\s*', '</p>\n<p>', text)
             text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
             self.text += '<p>%s</p>\n' % text
         else:
             raise ContextException("Uncaught Component Type! %s" % ctype)
     self.display_speech()
Example #4
0
    def parse_day(self, soup):
        """Walk the <p> elements of one day's report and emit its contents.

        ``self.date`` must look like ``YYYY-MM-DD`` with an optional
        trailing ``i``; it is used to build ``self.baseurl``.  The first and
        third paragraphs must be the "Northern Ireland Assembly" and
        "Contents" headings; the rest are classified in this order:

        * contents/anchor links and bare ``&nbsp;`` paragraphs are skipped;
        * paragraphs inside <i> update the timestamp, the deputy speaker
          or are appended as italic text;
        * paragraphs inside ``<font size=1>`` are appended as small text;
        * centred bold paragraphs (or bold ones ending in "Stage") are
          headings, with "Oral Answers" wrapped in ``<oral-heading>`` tags
          written to ``self.out``;
        * other bold paragraphs start a new speaker;
        * a bare "N." is kept and prefixed to the next plain paragraph.

        Raises:
            Exception: ``self.date`` is malformed or a page heading is
                missing.
            ContextException: a paragraph looks like an unhandled title or
                speaker.
        """
        body = soup('p')
        match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
        if not match:
            # Previously surfaced as an AttributeError on None.
            raise Exception('Unexpected date format: %s' % self.date)
        urldate = '%s%s%s%s' % match.groups()
        self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
        self.url = self.baseurl

        # Heading check
        if not re.match(r'Northern\s+Ireland\s+Assembly',
                        body[0].find(text=True)):
            raise Exception('Missing NIA heading!')
        if not re.match('Contents', body[2].find(text=True)):
            raise Exception('Missing contents heading!')
        body = body[3:]

        timestamp = ''
        in_oral_answers = False
        oral_qn = 0
        self.speaker = (None, timestamp)
        self.text = ''
        for p in body:
            if not p(text=True):
                continue
            ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
            phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

            # Skip in-page/contents links and empty spacer paragraphs.
            # Slicing (rather than indexing) keeps an empty href="" from
            # raising IndexError.
            href = p.a.get('href', '') if p.a else ''
            if href[:1] == '#' or re.match(r'\d', href) or ptext == '&nbsp;':
                continue

            if p.findParent('i'):
                ts = self.time_period(ptext, optional=True)
                if ts:
                    timestamp = ts
                    continue
                match = re.search(
                    r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))',
                    phtml)
                if match:
                    memberList.setDeputy(match.group(1))
                match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)',
                                 phtml)
                if match:
                    if match.group(1) == 'noon':
                        timestamp = '12:00'
                    else:
                        timestamp = match.group(1)
                    self.speaker = (self.speaker[0], timestamp)
                self.text += '<p class="italic">%s</p>\n' % phtml
                continue

            if p.findParent('font', size=1):
                self.text += '<p class="small">%s</p>\n' % phtml
                continue

            in_bold = p.b or p.parent.name == 'b'
            if (p.get('align', '') == 'center' and in_bold) or (
                    p.parent.name == 'b' and re.search('Stage$', ptext)):
                # Heading: flush the current speech and reset the speaker.
                self.display_speech()
                self.speaker = (None, timestamp)
                aname = p.a and p.a.get('name', '')
                # A purely numeric anchor name starts a new numbered section.
                anchor = re.match(r'#?(\d+)$', aname) if aname else None
                if ptext == 'Oral Answers':
                    self.out.write('<oral-heading>\n')
                    in_oral_answers = True
                    if anchor:
                        self.idA = int(anchor.group(1))
                        self.idB = 0
                        self.url = '%s#%s' % (self.baseurl, aname)
                elif anchor:
                    if in_oral_answers:
                        self.out.write('</oral-heading>\n')
                        in_oral_answers = False
                    self.idA = int(anchor.group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
                    self.display_heading(ptext, timestamp, 'major')
                elif aname:
                    self.idB += 1
                    self.display_heading(ptext, timestamp, 'major')
                else:
                    self.idB += 1
                    self.display_heading(ptext, timestamp, 'minor')
                continue

            if in_bold:
                # Bold text that is not a heading names a new speaker.
                if p.b:
                    new_speaker = p.b.find(text=True)
                else:
                    new_speaker = ptext
                if not re.match(r'\s*$', new_speaker):
                    self.display_speech()
                    speaker = re.sub(r"\s+", " ", new_speaker).strip()
                    speaker = re.sub(':', '', speaker)
                    # Only the second element of the match result is used.
                    _, matched = memberList.match(speaker, self.date)
                    self.speaker = (matched, timestamp)
                if p.b and p.b.nextSibling:
                    # Text follows the name in the same paragraph: drop the
                    # name element and keep the rest as the first paragraph.
                    p.b.extract()
                    phtml = re.sub(r"\s+", " ",
                                   p.renderContents()).decode('utf-8')
                    self.text += "<p>%s</p>\n" % phtml
                continue

            match = re.match(r'(\d+)\.$', phtml)
            if match:
                # Bare question number; prefixed to the next paragraph.
                oral_qn = match.group(1)
                continue
            if p.a and re.match(r'#\d+$', p.a.get('name', '')):
                raise ContextException('Uncaught title!')
            if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
                raise ContextException('Uncaught speaker! ' + phtml)
            if oral_qn:
                phtml = "%s. %s" % (oral_qn, phtml)
                oral_qn = 0
            self.text += "<p>%s</p>\n" % phtml
        self.display_speech()
        if in_oral_answers:
            self.out.write('</oral-heading>\n')
Example #5
0
 def parse_day(self, input):
     """Parse one day's plenary JSON and accumulate headings and speeches.

     ``input`` is a JSON list of component dicts.  Each component is
     dispatched on its ``ComponentType``.  Speech markup is accumulated in
     ``self.text``; ``self.display_speech()`` is called whenever a new
     heading, speaker or question starts, and once more after the last
     component.

     Raises:
         AssertionError: the Document Title does not match ``self.date``.
         Exception: a Header carries an unknown ``ComponentHeaderId``.
         ContextException: a Question has no "<name> asked" prefix, or the
             ComponentType is not handled.
     """
     self.heading = {}
     self.pre_heading = {}
     self.speaker = {}
     self.text = ''
     timestamp = ''
     for line in json.loads(input):
         # A null ComponentText is treated like an empty one instead of
         # crashing on None.replace; bare ampersands are escaped.
         text = (line['ComponentText'] or '').replace('&', '&amp;')
         if not text:
             # Call form prints the same single string on Python 2 and 3.
             print("WARNING: Empty line: %s" % line)
             continue

         ctype = line['ComponentType']
         if ctype == 'Document Title':
             expected = 'Plenary, %s/%s/%s' % (self.date[8:10], self.date[5:7], self.date[0:4])
             # Explicit raise (same exception type as the former assert) so
             # the check is not stripped when running with -O.
             if text != expected:
                 raise AssertionError("Unexpected document title %r, expected %r" % (text, expected))
         elif ctype == 'Time':
             timestamp = self.time_period(text)
         elif ctype == 'Header':
             level = line['ComponentHeaderId']
             if level in (0, 1):
                 typ = 'major'
             elif level == 2:
                 typ = 'minor'
             else:
                 raise Exception("Unknown ComponentHeaderId %s" % level)
             if self.heading and self.heading['type'] == typ:
                 # A second heading of the same type in a row is merged
                 # into the pending one; the earlier text is remembered so
                 # later headings at this level can be prefixed with it.
                 self.pre_heading = {'level': level, 'text': self.heading['text']}
                 self.heading['text'] += ' &#8212; %s' % text
             else:
                 self.display_speech()
                 self.speaker = {'ts': timestamp}
                 if self.pre_heading and self.pre_heading['level'] == level:
                     text = '%s &#8212; %s' % (self.pre_heading['text'], text)
                 elif self.pre_heading and self.pre_heading['level'] > level:
                     self.pre_heading = {}
                 self.heading = {'text': text, 'ts': timestamp, 'type': typ}
         elif re.match(r'Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$', ctype):
             # RelatedItemId here is the NI speaker ID. We could use that!
             # But for now, carry on going by name as all that code exists.
             self.display_speech()
             speaker = text.replace(':', '')
             _, stri = memberList.match(speaker, self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
         elif ctype in ('Speaker (Special)', 'Speaker (GuestSpeaker)'):
             # These speakers are recorded by name only, with no lookup.
             self.display_speech()
             speaker = text.replace(':', '')
             self.speaker = {'name': speaker, 'ts': timestamp}
         elif ctype == 'Question':
             self.display_speech()
             m = re.match(r'(T?[0-9]+\. )?(.*?) asked', text)
             if not m:
                 # Previously surfaced as an AttributeError on None.
                 raise ContextException("Question without '<name> asked' prefix: %s" % text)
             _, stri = memberList.match(m.group(2), self.date)
             self.speaker = {'id': stri, 'ts': timestamp}
             self.text += "<p>%s</p>\n" % text
         elif ctype == 'Quote':
             self.text += '<p class="indent">%s</p>\n' % text
         elif ctype in ('Plenary Item Text', 'Procedure Line'):
             match = re.match(r'The Assembly met at ((\d\d?):(\d\d) (am|pm)|12 noon)', text)
             if match:
                 timestamp = self.time_period(text)
                 self.speaker['ts'] = timestamp
             self.text += '<p class="italic">%s</p>\n' % text
         elif ctype == 'Bill Text':
             self.text += text.replace('<p>', '<p class="indent">')  # Already is HTML
         elif ctype in ('Division', 'Spoken Text'):
             # The inline flag must lead the pattern: a trailing global
             # flag is rejected by newer versions of the re module.
             text = re.sub(r'(?i)\s*<BR />\s*<BR />\s*', '</p>\n<p>', text)
             text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
             self.text += '<p>%s</p>\n' % text
         else:
             raise ContextException("Uncaught Component Type! %s" % ctype)
     self.display_speech()
Example #6
0
	def parse_day(self, soup):
		"""Walk the <p> elements of one day's report and emit its contents.

		``self.date`` must look like ``YYYY-MM-DD`` with an optional
		trailing ``i``; it is used to build ``self.baseurl``.  The first and
		third paragraphs must be the "Northern Ireland Assembly" and
		"Contents" headings; the rest are classified in this order:

		* contents/anchor links and bare ``&nbsp;`` paragraphs are skipped;
		* paragraphs inside <i> update the timestamp, the deputy speaker
		  or are appended as italic text;
		* paragraphs inside ``<font size=1>`` are appended as small text;
		* centred bold paragraphs (or bold ones ending in "Stage") are
		  headings, with "Oral Answers" wrapped in ``<oral-heading>`` tags
		  written to ``self.out``;
		* other bold paragraphs start a new speaker;
		* a bare "N." is kept and prefixed to the next plain paragraph.

		Raises:
			Exception: ``self.date`` is malformed or a page heading is
				missing.
			ContextException: a paragraph looks like an unhandled title or
				speaker.
		"""
		body = soup('p')
		match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
		if not match:
			# Previously surfaced as an AttributeError on None.
			raise Exception('Unexpected date format: %s' % self.date)
		urldate = '%s%s%s%s' % match.groups()
		self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
		self.url = self.baseurl

		# Heading check
		if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
			raise Exception('Missing NIA heading!')
		if not re.match('Contents', body[2].find(text=True)):
			raise Exception('Missing contents heading!')
		body = body[3:]

		timestamp = ''
		in_oral_answers = False
		oral_qn = 0
		self.speaker = (None, timestamp)
		self.text = ''
		for p in body:
			if not p(text=True):
				continue
			ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
			phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

			# Skip in-page/contents links and empty spacer paragraphs.
			# Slicing (rather than indexing) keeps an empty href="" from
			# raising IndexError.
			href = p.a.get('href', '') if p.a else ''
			if href[:1] == '#' or re.match(r'\d', href) or ptext == '&nbsp;':
				continue

			if p.findParent('i'):
				ts = self.time_period(ptext, optional=True)
				if ts:
					timestamp = ts
					continue
				match = re.search(r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))', phtml)
				if match:
					memberList.setDeputy(match.group(1))
				match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
				if match:
					if match.group(1) == 'noon':
						timestamp = '12:00'
					else:
						timestamp = match.group(1)
					self.speaker = (self.speaker[0], timestamp)
				self.text += '<p class="italic">%s</p>\n' % phtml
				continue

			if p.findParent('font', size=1):
				self.text += '<p class="small">%s</p>\n' % phtml
				continue

			in_bold = p.b or p.parent.name == 'b'
			if (p.get('align', '') == 'center' and in_bold) or (p.parent.name == 'b' and re.search('Stage$', ptext)):
				# Heading: flush the current speech and reset the speaker.
				self.display_speech()
				self.speaker = (None, timestamp)
				aname = p.a and p.a.get('name', '')
				# A purely numeric anchor name starts a new numbered section.
				anchor = re.match(r'#?(\d+)$', aname) if aname else None
				if ptext == 'Oral Answers':
					self.out.write('<oral-heading>\n')
					in_oral_answers = True
					if anchor:
						self.idA = int(anchor.group(1))
						self.idB = 0
						self.url = '%s#%s' % (self.baseurl, aname)
				elif anchor:
					if in_oral_answers:
						self.out.write('</oral-heading>\n')
						in_oral_answers = False
					self.idA = int(anchor.group(1))
					self.idB = 0
					self.url = '%s#%s' % (self.baseurl, aname)
					self.display_heading(ptext, timestamp, 'major')
				elif aname:
					self.idB += 1
					self.display_heading(ptext, timestamp, 'major')
				else:
					self.idB += 1
					self.display_heading(ptext, timestamp, 'minor')
				continue

			if in_bold:
				# Bold text that is not a heading names a new speaker.
				if p.b:
					new_speaker = p.b.find(text=True)
				else:
					new_speaker = ptext
				if not re.match(r'\s*$', new_speaker):
					self.display_speech()
					speaker = re.sub(r"\s+", " ", new_speaker).strip()
					speaker = re.sub(':', '', speaker)
					# Only the second element of the match result is used.
					_, matched = memberList.match(speaker, self.date)
					self.speaker = (matched, timestamp)
				if p.b and p.b.nextSibling:
					# Text follows the name in the same paragraph: drop the
					# name element and keep the rest as the first paragraph.
					p.b.extract()
					phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
					self.text += "<p>%s</p>\n" % phtml
				continue

			match = re.match(r'(\d+)\.$', phtml)
			if match:
				# Bare question number; prefixed to the next paragraph.
				oral_qn = match.group(1)
				continue
			if p.a and re.match(r'#\d+$', p.a.get('name', '')):
				raise ContextException('Uncaught title!')
			if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
				raise ContextException('Uncaught speaker! ' + phtml)
			if oral_qn:
				phtml = "%s. %s" % (oral_qn, phtml)
				oral_qn = 0
			self.text += "<p>%s</p>\n" % phtml
		self.display_speech()
		if in_oral_answers:
			self.out.write('</oral-heading>\n')