Ejemplo n.º 1
0
	def new_italic_speech(self, ptext, phtml):
		"""Append an italic paragraph and update the chair if it names one.

		``ptext`` is searched for "(Mr Speaker" / "(Madam Speaker"; failing
		that, for "(Mr Deputy Speaker [...]" or "(Mr Principal Deputy
		Speaker [...]". On a hit, the captured group (the Speaker title, or
		the text inside the square brackets) is passed to
		``memberList.setDeputy``. In every case ``phtml`` is appended to
		``self.text`` wrapped in ``<p class="italic">``.

		ptext -- text of the paragraph, used only for the chair search
		phtml -- markup of the paragraph, written to the output
		"""
		# Raw strings keep the regex escapes out of Python's own string-escape
		# handling; the pattern text itself is unchanged.
		match = re.search(r'\(((?:Mr|Madam) Speaker)', ptext)
		if not match:
			match = re.search(r'\(Mr (?:Principal )?Deputy Speaker \[(.*?)\]', ptext)
		if match:
			memberList.setDeputy(match.group(1))
		self.text += '<p class="italic">%s</p>\n' % phtml
Ejemplo n.º 2
0
 def new_italic_speech(self, ptext, phtml):
     """Record an italic paragraph, noting who is in the chair if stated.

     The chair is looked for in ``ptext``: first "(Mr Speaker" or
     "(Madam Speaker", otherwise "(Mr Deputy Speaker [...]" with an
     optional "Principal". When found, the captured group (the Speaker
     title, or the bracketed text) is handed to ``memberList.setDeputy``.
     ``phtml`` is always appended to ``self.text`` inside
     ``<p class="italic">``.

     ptext -- text of the paragraph, used only for the chair search
     phtml -- markup of the paragraph, written to the output
     """
     # Patterns are raw strings so the backslash escapes reach the regex
     # engine untouched. A match object is always truthy, so ``or`` falls
     # through to the second search only when the first finds nothing.
     chair = (re.search(r'\(((?:Mr|Madam) Speaker)', ptext)
              or re.search(r'\(Mr (?:Principal )?Deputy Speaker \[(.*?)\]',
                           ptext))
     if chair:
         memberList.setDeputy(chair.group(1))
     self.text += '<p class="italic">%s</p>\n' % phtml
Ejemplo n.º 3
0
 def parse_day(self, text):
     """Parse one day's page text with the parser class chosen by date.

     The text is turned into an ``NISoup`` document, the per-day state held
     by ``memberList`` is reset, and the document is handed to one of three
     parser classes:

     * ``ParseDayHTMLParserNew2012`` for dates from 2014-09-07 on, and for
       dates from 2012-04-30 on when the query ``page('p', {'class': True})``
       comes back empty;
     * ``ParseDayHTMLParserNew2006`` for any other date whose year is 2006
       or later;
     * ``ParseDayHTMLParserOld`` otherwise.

     text -- raw page content for the day held in ``self.date``
     """
     page = NISoup(text, markupMassage=NISoup.myMassage)

     # Don't want to keep it between days, or reruns of same day
     memberList.cleardebatehistory()
     memberList.setDeputy(None)

     # Dates are compared as strings; the soup is only queried for dates in
     # the 2012-04-30 .. 2014-09-06 window.
     if self.date >= '2014-09-07':
         handler = ParseDayHTMLParserNew2012
     elif self.date >= '2012-04-30' and not page('p', {'class': True}):
         handler = ParseDayHTMLParserNew2012
     elif int(self.date[0:4]) >= 2006:
         handler = ParseDayHTMLParserNew2006
     else:
         handler = ParseDayHTMLParserOld

     handler(self.out, self.date).parse_day(page)
Ejemplo n.º 4
0
 def parse_day(self, text):
     """Reset per-day state and dispatch the page to a date-specific parser.

     ``text`` is parsed into an ``NISoup`` tree. The debate history and the
     current deputy held by ``memberList`` are cleared, then the tree is
     passed to ``parse_day`` on a parser built with ``self.out`` and
     ``self.date``. The parser class depends on ``self.date`` (compared as
     a string) and, for dates between 2012-04-30 and 2014-09-06, on whether
     the tree query for ``p`` with ``{'class': True}`` returns anything.

     text -- raw page content for the day held in ``self.date``
     """
     tree = NISoup(text, markupMassage=NISoup.myMassage)

     # Don't want to keep it between days, or reruns of same day
     memberList.cleardebatehistory()
     memberList.setDeputy(None)

     # Short-circuit order matters: the tree is only queried when the date
     # alone does not settle the question.
     uses_2012_layout = self.date >= '2014-09-07' or (
         self.date >= '2012-04-30' and not tree('p', {'class': True}))

     if uses_2012_layout:
         chosen = ParseDayHTMLParserNew2012
     else:
         chosen = (ParseDayHTMLParserNew2006
                   if int(self.date[0:4]) >= 2006
                   else ParseDayHTMLParserOld)

     chosen(self.out, self.date).parse_day(tree)
Ejemplo n.º 5
0
    def parse_day(self, soup):
        """Convert one day's parsed report into speeches and headings.

        Builds ``self.baseurl``/``self.url`` from ``self.date``, checks the
        leading paragraphs of the page, then walks the remaining ``<p>``
        elements in order. Speech markup accumulates in ``self.text`` and is
        flushed through ``self.display_speech()`` whenever a heading or a
        new speaker starts, and once more at the end.

        soup -- parsed document; ``soup('p')`` must list its paragraphs.

        Raises Exception if the first paragraph does not start with
        "Northern Ireland Assembly" or the third does not start with
        "Contents". Raises ContextException for a numbered anchor or a
        speaker-like line that none of the earlier rules handled.
        """
        body = soup('p')
        # self.date looks like YYYY-MM-DD with an optional trailing "i"; the
        # report URL uses the two-digit year, month, day and that suffix.
        match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
        urldate = '%s%s%s%s' % match.groups()
        self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
        self.url = self.baseurl

        # Heading check
        if not re.match(r'Northern\s+Ireland\s+Assembly',
                        body[0].find(text=True)):
            raise Exception('Missing NIA heading!')
        # Read but not used further in this method.
        date_head = body[1].find(text=True)
        if not re.match(r'Contents', body[2].find(text=True)):
            raise Exception('Missing contents heading!')
        body = body[3:]

        timestamp = ''
        in_oral_answers = False
        oral_qn = 0
        self.speaker = (None, timestamp)
        self.text = ''
        for p in body:
            if not p(text=True): continue
            ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
            phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
            # Skip paragraphs whose first link points at "#..." or at
            # something starting with a digit, and bare "&nbsp;" paragraphs.
            # startswith() is used so an empty href does not raise.
            if (p.a and p.a.get('href', ' ').startswith('#')) or (
                    p.a and re.match(r'\d', p.a.get('href', ''))
            ) or ptext == '&nbsp;':
                continue
            if p.findParent('i'):
                # Italic paragraph: either a time marker or a procedural note.
                ts = self.time_period(ptext, optional=True)
                if ts:
                    timestamp = ts
                    continue
                match = re.search(
                    r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))',
                    phtml)
                if match:
                    memberList.setDeputy(match.group(1))
                match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)',
                                 phtml)
                if match:
                    # NOTE(review): "noon" becomes '12:00' while other times
                    # keep the captured 'HH.MM' form - confirm downstream
                    # accepts both.
                    if match.group(1) == 'noon':
                        timestamp = '12:00'
                    else:
                        timestamp = match.group(1)
                    self.speaker = (self.speaker[0], timestamp)
                self.text += '<p class="italic">%s</p>\n' % phtml
                continue
            if p.findParent('font', size=1):
                self.text += '<p class="small">%s</p>\n' % phtml
                continue
            if (p.get('align', '') == 'center' and
                (p.b or p.parent.name == 'b')) or (p.parent.name == 'b' and
                                                   re.search(r'Stage$', ptext)):
                # Heading: flush the pending speech and clear the speaker.
                self.display_speech()
                self.speaker = (None, timestamp)
                aname = p.a and p.a.get('name', '')
                if ptext == 'Oral Answers':
                    self.out.write('<oral-heading>\n')
                    in_oral_answers = True
                    if aname and re.match(r'#?\d+$', aname):
                        self.idA = int(re.match(r'#?(\d+)$', aname).group(1))
                        self.idB = 0
                        self.url = '%s#%s' % (self.baseurl, aname)
                elif aname and re.match(r'#?\d+$', aname):
                    # A numbered anchor closes any open oral-answers section
                    # and restarts the idA/idB numbering.
                    if in_oral_answers:
                        self.out.write('</oral-heading>\n')
                        in_oral_answers = False
                    self.idA = int(re.match(r'#?(\d+)$', aname).group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
                    self.display_heading(ptext, timestamp, 'major')
                elif aname:
                    self.idB += 1
                    self.display_heading(ptext, timestamp, 'major')
                else:
                    self.idB += 1
                    self.display_heading(ptext, timestamp, 'minor')
                continue
            elif p.b or p.parent.name == 'b':
                # Non-heading bold text introduces a speaker.
                if p.b:
                    new_speaker = p.b.find(text=True)
                else:
                    new_speaker = ptext
                if not re.match(r'\s*$', new_speaker):
                    self.display_speech()
                    speaker = re.sub(r"\s+", " ", new_speaker).strip()
                    speaker = re.sub(':', '', speaker)
                    # Only the second element of the returned pair is kept.
                    member_id, member_str = memberList.match(speaker, self.date)
                    self.speaker = (member_str, timestamp)
                if p.b and p.b.nextSibling:
                    # Drop the bold name and keep the rest as speech text.
                    p.b.extract()
                    phtml = re.sub(r"\s+", " ",
                                   p.renderContents()).decode('utf-8')
                    self.text += "<p>%s</p>\n" % phtml
                continue
            # A paragraph that is just "N." is held back and prefixed to the
            # next ordinary paragraph.
            match = re.match(r'(\d+)\.$', phtml)
            if match:
                oral_qn = match.group(1)
                continue
            if p.a and re.match(r'#\d+$', p.a.get('name', '')):
                raise ContextException('Uncaught title!')
            if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
                raise ContextException('Uncaught speaker! ' + phtml)
            if oral_qn:
                phtml = "%s. %s" % (oral_qn, phtml)
                oral_qn = 0
            self.text += "<p>%s</p>\n" % phtml
        self.display_speech()
        if in_oral_answers:
            self.out.write('</oral-heading>\n')
            in_oral_answers = False
Ejemplo n.º 6
0
	def parse_day(self, soup):
		"""Turn the paragraphs of one day's report into speeches and headings.

		Sets ``self.baseurl``/``self.url`` from ``self.date``, validates the
		leading paragraphs, then processes every remaining ``<p>`` in order.
		Speech markup is collected in ``self.text`` and flushed with
		``self.display_speech()`` at each heading, each new speaker, and at
		the end of the day.

		soup -- parsed document; ``soup('p')`` must list its paragraphs.

		Raises Exception if the first paragraph does not start with
		"Northern Ireland Assembly" or the third does not start with
		"Contents". Raises ContextException for a numbered anchor or a
		speaker-like line that none of the earlier rules handled.
		"""
		body = soup('p')
		# self.date looks like YYYY-MM-DD with an optional trailing "i"; the
		# report URL uses the two-digit year, month, day and that suffix.
		match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
		urldate = '%s%s%s%s' % match.groups()
		self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
		self.url = self.baseurl

		# Heading check
		if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
			raise Exception('Missing NIA heading!')
		# Read but not used further in this method.
		date_head = body[1].find(text=True)
		if not re.match(r'Contents', body[2].find(text=True)):
			raise Exception('Missing contents heading!')
		body = body[3:]

		timestamp = ''
		in_oral_answers = False
		oral_qn = 0
		self.speaker = (None, timestamp)
		self.text = ''
		for p in body:
			if not p(text=True): continue
			ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
			phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
			# Skip paragraphs whose first link points at "#..." or at
			# something starting with a digit, and bare "&nbsp;" paragraphs.
			# startswith() is used so an empty href does not raise.
			if (p.a and p.a.get('href', ' ').startswith('#')) or (p.a and re.match(r'\d', p.a.get('href', ''))) or ptext=='&nbsp;':
				continue
			if p.findParent('i'):
				# Italic paragraph: either a time marker or a procedural note.
				ts = self.time_period(ptext, optional=True)
				if ts:
					timestamp = ts
					continue
				match = re.search(r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))', phtml)
				if match:
					memberList.setDeputy(match.group(1))
				match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
				if match:
					# NOTE(review): "noon" becomes '12:00' while other times
					# keep the captured 'HH.MM' form - confirm downstream
					# accepts both.
					if match.group(1) == 'noon':
						timestamp = '12:00'
					else:
						timestamp = match.group(1)
					self.speaker = (self.speaker[0], timestamp)
				self.text += '<p class="italic">%s</p>\n' % phtml
				continue
			if p.findParent('font', size=1):
				self.text += '<p class="small">%s</p>\n' % phtml
				continue
			if (p.get('align', '') == 'center' and (p.b or p.parent.name == 'b')) or (p.parent.name == 'b' and re.search(r'Stage$', ptext)):
				# Heading: flush the pending speech and clear the speaker.
				self.display_speech()
				self.speaker = (None, timestamp)
				aname = p.a and p.a.get('name', '')
				if ptext == 'Oral Answers':
					self.out.write('<oral-heading>\n')
					in_oral_answers = True
					if aname and re.match(r'#?\d+$', aname):
						self.idA = int(re.match(r'#?(\d+)$', aname).group(1))
						self.idB = 0
						self.url = '%s#%s' % (self.baseurl, aname)
				elif aname and re.match(r'#?\d+$', aname):
					# A numbered anchor closes any open oral-answers section
					# and restarts the idA/idB numbering.
					if in_oral_answers:
						self.out.write('</oral-heading>\n')
						in_oral_answers = False
					self.idA = int(re.match(r'#?(\d+)$', aname).group(1))
					self.idB = 0
					self.url = '%s#%s' % (self.baseurl, aname)
					self.display_heading(ptext, timestamp, 'major')
				elif aname:
					self.idB += 1
					self.display_heading(ptext, timestamp, 'major')
				else:
					self.idB += 1
					self.display_heading(ptext, timestamp, 'minor')
				continue
			elif p.b or p.parent.name == 'b':
				# Non-heading bold text introduces a speaker.
				if p.b:
					new_speaker = p.b.find(text=True)
				else:
					new_speaker = ptext
				if not re.match(r'\s*$', new_speaker):
					self.display_speech()
					speaker = re.sub(r"\s+", " ", new_speaker).strip()
					speaker = re.sub(':', '', speaker)
					# Only the second element of the returned pair is kept.
					member_id, member_str = memberList.match(speaker, self.date)
					self.speaker = (member_str, timestamp)
				if p.b and p.b.nextSibling:
					# Drop the bold name and keep the rest as speech text.
					p.b.extract()
					phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
					self.text += "<p>%s</p>\n" % phtml
				continue
			# A paragraph that is just "N." is held back and prefixed to the
			# next ordinary paragraph.
			match = re.match(r'(\d+)\.$', phtml)
			if match:
				oral_qn = match.group(1)
				continue
			if p.a and re.match(r'#\d+$', p.a.get('name', '')):
				raise ContextException('Uncaught title!')
			if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
				raise ContextException('Uncaught speaker! ' + phtml)
			if oral_qn:
				phtml = "%s. %s" % (oral_qn, phtml)
				oral_qn = 0
			self.text += "<p>%s</p>\n" % phtml
		self.display_speech()
		if in_oral_answers:
			self.out.write('</oral-heading>\n')
			in_oral_answers = False