def saveProceedingsStatement(self, text, t):
    """Record a block of proceedings text as a Statement.

    The text is stripped, run through parsetools.tameWhitespace and
    parsetools.sane_quotes and, if anything is left, stored in
    self.statements as a Statement whose ``who`` is 'Proceedings'.
    self.statement_index supplies the sequence number and is then
    incremented. Empty text is ignored and changes no state.

    Args:
        text: Raw text of the proceedings block.
        t: Mapping with a 'timestamp' entry. The entry may be a
            datetime.time (it is combined with self.date) or a complete
            datetime.datetime (it is used unchanged).
    """
    text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
    if not text:
        return

    timestamp = t['timestamp']
    if not isinstance(timestamp, datetime.datetime):
        # datetime.combine() rejects a datetime as its second argument
        # (TypeError), so only bare time-of-day values are combined with
        # the sitting date.
        timestamp = datetime.datetime.combine(self.date, timestamp)

    statement = Statement(
        hansard=self.hansard,
        time=timestamp,
        text=text,
        sequence=self.statement_index,
        who='Proceedings',
    )
    self.statement_index += 1
    self.statements.append(statement)
def _getHansardNumber(page):
    """Return the Hansard number found in the <title> of an HTML page.

    Two title layouts are tried in order:

    * new format, e.g. ``Number 079`` (searched in the whitespace-tamed
      title; the number must be followed by a space);
    * old format, e.g. ``(079)`` (searched in the raw title).

    Leading zeros are removed from the result, which is returned as a
    string.

    Args:
        page: HTML source of the Hansard page, as a string.

    Raises:
        Exception: if neither layout matches the title.

    NOTE: a page with no <title> element fails earlier, on ``.group(1)``
    of a None match (AttributeError).
    """
    title = re.search(r'<title>([^<]+)</title>', page).group(1)

    # New format: Number 079
    found = re.search(r'Number +(\d+\S*) ', parsetools.tameWhitespace(title))
    if not found:
        # Old format (079)
        found = re.search(r'\((\d+\S*)\)', title)
    if not found:
        raise Exception("Couldn't parse number from Hansard title: %s" % title)

    return re.sub('^0+', '', found.group(1))
def saveProceedingsStatement(self, text, t):
    """Store cleaned proceedings text as a 'Proceedings' Statement.

    ``text`` is stripped and normalised with parsetools.tameWhitespace
    and parsetools.sane_quotes. When the result is non-empty, a Statement
    is built with the current self.statement_index as its sequence, the
    index is incremented, and the statement is appended to
    self.statements. When the result is empty nothing happens.

    Args:
        text: Raw proceedings text.
        t: Mapping providing 'timestamp', either a datetime.time (joined
            with self.date to form the statement time) or an already
            complete datetime.datetime (taken as the statement time).
    """
    cleaned = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
    if cleaned:
        when = t['timestamp']
        # Passing a datetime to datetime.combine() as the time argument
        # raises TypeError, so a full datetime is used as given and only
        # a plain time is combined with the date.
        if isinstance(when, datetime.datetime):
            statement_time = when
        else:
            statement_time = datetime.datetime.combine(self.date, when)
        statement = Statement(hansard=self.hansard,
                              time=statement_time,
                              text=cleaned,
                              sequence=self.statement_index,
                              who='Proceedings')
        self.statement_index += 1
        self.statements.append(statement)
def saveProceedingsStatement(self, text, t):
    """Append a 'Proceedings' Statement built from ``text``.

    The text is stripped, then cleaned by parsetools.tameWhitespace and
    parsetools.sane_quotes; empty results are dropped without touching
    any state. Otherwise a Statement is created with
    self.statement_index as its sequence, the index is bumped by one and
    the statement is added to self.statements.

    Args:
        text: Raw proceedings text.
        t: Mapping with a 'timestamp' entry, either a datetime.datetime
            or a datetime.time.
    """
    body = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
    if len(body) == 0:
        return

    when = t['timestamp']
    if not isinstance(when, datetime.datetime):
        # The older parser provides only datetime.time objects
        when = datetime.datetime.combine(self.date, when)

    entry = Statement(
        hansard=self.hansard,
        time=when,
        text=body,
        sequence=self.statement_index,
        who='Proceedings',
    )
    self.statement_index += 1
    self.statements.append(entry)
def _getHansardNumber(page):
    """Parse the Hansard number out of a page's <title> element.

    The title text is matched against the new layout (``Number 079``,
    tested on the whitespace-tamed title) and then the old layout
    (``(079)``, tested on the raw title). The first hit wins and is
    returned as a string with leading zeros removed.

    Args:
        page: HTML source of the Hansard page, as a string.

    Raises:
        Exception: when the title matches neither layout.

    NOTE: if the page has no <title> element, the initial ``.group(1)``
    is called on None and raises AttributeError.
    """
    title = re.search(r'<title>([^<]+)</title>', page).group(1)

    attempts = (
        # New format: Number 079
        (r'Number +(\d+\S*) ', parsetools.tameWhitespace(title)),
        # Old format (079)
        (r'\((\d+\S*)\)', title),
    )
    for pattern, haystack in attempts:
        match = re.search(pattern, haystack)
        if match:
            # Drop leading zeros only; the captured group holds no whitespace.
            return match.group(1).lstrip('0')

    raise Exception("Couldn't parse number from Hansard title: %s" % title)
def addText(self, text, blockquote=False):
    """Clean ``text`` and append it to the text buffer as a new entry.

    Nothing is done while self._ignoretext is set. Otherwise the text is
    stripped, passed through parsetools.tameWhitespace and
    parsetools.sane_quotes, and then has these lead-ins removed, in this
    order: an initial colon, 'He said: ', 'She said: '. Text that ends up
    empty or whitespace-only is discarded.

    The entry is prefixed with '> ' when ``blockquote`` is true, or when
    it starts with 'moved ' and self.hasText() returns false.

    Args:
        text: Raw text to add.
        blockquote: Force the '> ' prefix on this entry.
    """
    if self._ignoretext:
        return

    cleaned = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))

    # Strip initial colon
    if cleaned.startswith(':'):
        cleaned = cleaned[1:].strip()
    # The slices below stop one character short of the full prefix; the
    # leftover space is removed by strip().
    if cleaned.startswith('He said: '):
        cleaned = cleaned[8:].strip()
    if cleaned.startswith('She said: '):
        cleaned = cleaned[9:].strip()

    if len(cleaned) == 0 or cleaned.isspace():
        return

    quoted = blockquote or (cleaned.startswith('moved ') and not self.hasText())
    self._textbuffer.append(u'> ' + cleaned if quoted else cleaned)
def appendToText(self, text, italic=False):
    """Concatenate ``text`` onto the last entry of the text buffer.

    Does nothing when self.hasText() returns false or self._ignoretext is
    set. The text is stripped and passed through
    parsetools.tameWhitespace; empty or whitespace-only results are
    dropped. With ``italic`` set, the text is wrapped as
    ' <em>...</em> ' (with the surrounding spaces) before being appended.

    Args:
        text: Raw text to append to the current entry.
        italic: Wrap the text in <em> tags.
    """
    if not self.hasText() or self._ignoretext:
        return

    addition = parsetools.tameWhitespace(text.strip())
    if len(addition) == 0 or addition.isspace():
        return

    if italic:
        addition = u' <em>' + addition + u'</em> '
    self._textbuffer[-1] += addition