def new_person_speak(self, p, timestamp):
    """Start a new speech from a paragraph whose <strong> tag names the speaker.

    The first text node inside ``p.strong`` is cleaned up and looked up with
    ``memberList.match`` for ``self.date``; the second element of that result
    is stored with ``timestamp`` in ``self.speaker``.  The ``<strong>`` tag is
    then removed and what is left of the paragraph is appended to
    ``self.text`` as a ``<p>`` element.
    """
    raw_name = p.strong.find(text=True)
    # NOTE(review): the one-character pattern below is copied as found; it may
    # originally have been a non-breaking space rather than a plain space --
    # confirm against the upstream file.
    name = re.sub(' ', '', raw_name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(':', '', name)
    _member_id, matched_name = memberList.match(name, self.date)
    self.speaker = (matched_name, timestamp)

    # Drop the speaker tag so that only the spoken words are rendered.
    p.strong.extract()
    remainder = re.sub(r'^:\s*', '', p.renderContents())
    remainder = re.sub(r"\s+", " ", remainder).decode('utf-8')
    self.text += "<p>%s</p>\n" % remainder
# Grab page
# try/finally guarantees the handle is released even when read() fails.
ur = open('../rawdata/Members_of_the_NIA_2007')
try:
    content = ur.read()
finally:
    ur.close()

# Each pattern captures (link href, link text, text of the next cell's link)
# from one table row; the loop below names these url, name and cons.
member_patterns = (
    # Plain row: a single member link.
    r'<tr>\s+<td><a href="(/wiki/[^"]+)"[^>]*>([^<]+)</a></td>\s+<td><a href="/wiki/[^"]+" title="[^"]+">([^<]+)</a></td>',
    # "X (resigned|deceased), replaced by Y": capture X ...
    r'<tr>\s+<td><a href="(/wiki/[^"]+)"[^>]*>([^<]+)</a> \((?:resigned|deceased)\), replaced by <a href="/wiki/[^"]+"[^>]*>[^<]+</a></td>\s+<td><a href="/wiki/[^"]+" title="[^"]+">([^<]+)</a></td>',
    # ... and capture the replacement Y from the same kind of row.
    r'<tr>\s+<td><a href="/wiki/[^"]+"[^>]*>[^<]+</a> \((?:resigned|deceased)\), replaced by <a href="(/wiki/[^"]+)"[^>]*>([^<]+)</a></td>\s+<td><a href="/wiki/[^"]+" title="[^"]+">([^<]+)</a></td>',
)
matches = []
for matcher in member_patterns:
    matches.extend(re.findall(matcher, content))

for (url, name, cons) in matches:
    name = name.decode('utf-8')
    # The lookup result is bound to member_id / _matched_name rather than
    # id / str, so the builtins stay usable for the rest of the script.
    try:
        member_id, _matched_name = memberList.match(name, date_today)
        current_members.append(member_id)
    except Exception:
        # Not matched for today: retry with fixed earlier dates.  Only a
        # match for date_today adds the member to current_members.
        try:
            member_id, _matched_name = memberList.match(name, '2011-01-01')
        except Exception:
            # For the resigned/died MLAs, use an earlier date
            member_id, _matched_name = memberList.match(name, '2007-01-01')
    pid = memberList.membertoperson(member_id)
    wikimembers[pid] = url

print('''<?xml version="1.0" encoding="ISO-8859-1"?> <publicwhip>''')
k = sorted(wikimembers)
def parse_day_old(self, body):
    """Parse one day's report in the old page layout.

    ``body`` is an indexable sequence of parsed paragraph tags; the code
    calls ``find``, ``findParent``, ``renderContents`` and ``get`` on them
    (presumably BeautifulSoup tags -- confirm against the caller).  The
    first three entries must be the "Northern Ireland Assembly" heading, a
    second heading that is not used here, and the "Contents" heading.

    Each remaining paragraph is classified as a timestamp or other italic
    note, small print, a heading, a speaker change or plain speech.  Speech
    text is accumulated in ``self.text`` and handed over through
    ``self.display_speech()`` / ``self.display_heading()``; oral-answer
    sections are wrapped in ``<oral-heading>`` markers written to
    ``self.out``.

    Raises:
        Exception: if ``self.date`` is not in the expected form or one of
            the two checked headings is missing.
        ContextException: if a paragraph looks like a title or a speaker
            that none of the branches recognised.
    """
    match = re.match(r'\d\d(\d\d)-(\d\d)-(\d\d)(i?)$', self.date)
    if not match:
        # Fail with a readable message instead of an AttributeError on None.
        raise Exception('Unexpected date format: %r' % (self.date,))
    urldate = '%s%s%s%s' % match.groups()
    self.baseurl = 'http://www.niassembly.gov.uk/record/reports/%s.htm' % urldate
    self.url = self.baseurl

    # Heading check
    if not re.match(r'Northern\s+Ireland\s+Assembly', body[0].find(text=True)):
        raise Exception('Missing NIA heading!')
    # body[1] is skipped: nothing below uses it.
    if not re.match('Contents', body[2].find(text=True)):
        raise Exception('Missing contents heading!')
    body = body[3:]

    timestamp = ''
    in_oral_answers = False
    oral_qn = 0
    self.speaker = (None, timestamp)
    self.text = ''
    for p in body:
        if not p(text=True):
            continue
        ptext = re.sub(r"\s+", " ", ''.join(p(text=True)))
        phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')

        # Skip paragraphs whose first link is an in-page ("#...") or
        # digit-led href.  startswith() also copes with href="".
        if p.a:
            href = p.a.get('href', '')
            if href.startswith('#') or re.match(r'\d', href):
                continue
        # NOTE(review): literal copied as found; it may originally have been
        # a non-breaking space rather than a plain space -- confirm upstream.
        if ptext == ' ':
            continue

        # Italic paragraphs: clock times and procedural notes.
        if p.findParent('i'):
            match = re.match(r'(\d\d?)\.(\d\d) (a|p)m', ptext)
            if match:
                hour = int(match.group(1))
                if hour < 12 and match.group(3) == 'p':
                    hour += 12
                timestamp = "%s:%s" % (hour, match.group(2))
                continue
            match = re.search(r'(?:\(|\[)(?:Mr|Madam) Deputy Speaker (?:\[|\()(.*?)(?:\]|\))', phtml)
            if match:
                memberList.setDeputy(match.group(1))
            match = re.match(r'The Assembly met at (\d\d\.\d\d|noon)', phtml)
            if match:
                if match.group(1) == 'noon':
                    timestamp = '12:00'
                else:
                    # NOTE(review): this keeps the "hh.mm" form from the
                    # page, while the other timestamps here use a colon --
                    # confirm that consumers accept both.
                    timestamp = match.group(1)
                self.speaker = (self.speaker[0], timestamp)
            self.text += '<p class="italic">%s</p>\n' % phtml
            continue

        # Small print.
        if p.findParent('font', size=1):
            self.text += '<p class="small">%s</p>\n' % phtml
            continue

        # Headings: centred bold text, or bold text ending in "Stage".
        if (p.get('align', '') == 'center' and (p.b or p.parent.name == 'b')) or (p.parent.name == 'b' and re.search('Stage$', ptext)):
            self.display_speech()
            self.speaker = (None, timestamp)
            aname = p.a and p.a.get('name', '')
            # A purely numeric anchor name (optionally "#"-prefixed) starts
            # a new top-level section id.
            numbered = aname and re.match(r'#?(\d+)$', aname)
            if ptext == 'Oral Answers':
                self.out.write('<oral-heading>\n')
                in_oral_answers = True
                if numbered:
                    self.idA = int(numbered.group(1))
                    self.idB = 0
                    self.url = '%s#%s' % (self.baseurl, aname)
            elif numbered:
                if in_oral_answers:
                    self.out.write('</oral-heading>\n')
                    in_oral_answers = False
                self.idA = int(numbered.group(1))
                self.idB = 0
                self.url = '%s#%s' % (self.baseurl, aname)
                self.display_heading(ptext, timestamp, 'major')
            elif aname:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'major')
            else:
                self.idB += 1
                self.display_heading(ptext, timestamp, 'minor')
            continue
        elif p.b or p.parent.name == 'b':
            # Any other bold text introduces a speaker.
            if p.b:
                new_speaker = p.b.find(text=True)
            else:
                new_speaker = ptext
            if not re.match(r'\s*$', new_speaker):
                self.display_speech()
                speaker = re.sub(r"\s+", " ", new_speaker).strip()
                speaker = re.sub(':', '', speaker)
                _member_id, speaker_name = memberList.match(speaker, self.date)
                self.speaker = (speaker_name, timestamp)
            if p.b and p.b.nextSibling:
                # Text follows the name in the same paragraph: drop the
                # bold tag and keep the rest as the start of the speech.
                p.b.extract()
                phtml = re.sub(r"\s+", " ", p.renderContents()).decode('utf-8')
                self.text += "<p>%s</p>\n" % phtml
            continue

        # A bare "N." paragraph is held back and prefixed to the next one.
        match = re.match(r'(\d+)\.$', phtml)
        if match:
            oral_qn = match.group(1)
            continue

        # Anything still looking like a title or a speaker slipped through
        # the branches above.
        if p.a and re.match(r'#\d+$', p.a.get('name', '')):
            raise ContextException('Uncaught title!')
        if re.match(r'Mr\w*(\s+\w)?\s+\w+:$', phtml):
            raise ContextException('Uncaught speaker! ' + phtml)

        if oral_qn:
            phtml = "%s. %s" % (oral_qn, phtml)
            oral_qn = 0
        self.text += "<p>%s</p>\n" % phtml

    self.display_speech()
    if in_oral_answers:
        self.out.write('</oral-heading>\n')