Ejemplo n.º 1
0
 def saveProceedingsStatement(self, text, t):
     """Record a block of proceedings text as a ``Statement``.

     ``text`` is stripped and passed through ``parsetools.tameWhitespace``
     and ``parsetools.sane_quotes``; if nothing is left, no statement is
     created. Otherwise a ``Statement`` attributed to ``'Proceedings'`` is
     appended to ``self.statements`` and ``self.statement_index`` advances
     by one.

     Args:
         text: Raw text of the proceedings block.
         t: Mapping with a ``'timestamp'`` entry. A ``datetime.time`` is
             combined with ``self.date``; a full ``datetime.datetime`` is
             used unchanged.
     """
     text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
     if not text:
         return
     timestamp = t['timestamp']
     if not isinstance(timestamp, datetime.datetime):
         # datetime.combine() rejects a datetime as its second argument,
         # so only bare time objects are combined with the sitting date.
         timestamp = datetime.datetime.combine(self.date, timestamp)
     statement = Statement(hansard=self.hansard,
         time=timestamp,
         text=text, sequence=self.statement_index,
         who='Proceedings')
     self.statement_index += 1
     self.statements.append(statement)
def _getHansardNumber(page):
    title = re.search(r'<title>([^<]+)</title>', page).group(1)
    match = re.search(r'Number +(\d+\S*) ', parsetools.tameWhitespace(title)) # New format: Number 079
    if match:
        return re.sub('^0+', '', match.group(1))
    else:
        match = re.search(r'\((\d+\S*)\)', title) # Old format (079)
        if match:
            return re.sub('^0+', '', match.group(1))
        else:
            raise Exception("Couldn't parse number from Hansard title: %s" % title)
Ejemplo n.º 3
0
 def saveProceedingsStatement(self, text, t):
     """Store non-empty proceedings text as a ``'Proceedings'`` statement.

     The text is stripped, then run through ``parsetools.tameWhitespace``
     and ``parsetools.sane_quotes``. An empty result is ignored; otherwise
     a new ``Statement`` is built with the current ``self.statement_index``
     as its sequence, the index is incremented, and the statement is added
     to ``self.statements``.

     Args:
         text: Raw text of the proceedings block.
         t: Mapping providing ``'timestamp'``, either a ``datetime.time``
             (paired with ``self.date``) or a complete
             ``datetime.datetime`` (taken as is).
     """
     cleaned = parsetools.tameWhitespace(text.strip())
     cleaned = parsetools.sane_quotes(cleaned)
     if not cleaned:
         return

     when = t['timestamp']
     if not isinstance(when, datetime.datetime):
         # Only a bare time needs the sitting date attached; passing a
         # datetime to datetime.combine() would raise TypeError.
         when = datetime.datetime.combine(self.date, when)

     statement = Statement(hansard=self.hansard,
                           time=when,
                           text=cleaned,
                           sequence=self.statement_index,
                           who='Proceedings')
     self.statement_index += 1
     self.statements.append(statement)
Ejemplo n.º 4
0
 def saveProceedingsStatement(self, text, t):
     """Record a block of proceedings text as a ``Statement``.

     The stripped text goes through ``parsetools.tameWhitespace`` and
     ``parsetools.sane_quotes``; an empty result records nothing.

     Args:
         text: Raw text of the proceedings block.
         t: Mapping with a ``'timestamp'`` entry, either a
             ``datetime.datetime`` or a value to be combined with
             ``self.date``.
     """
     cleaned = parsetools.tameWhitespace(text.strip())
     cleaned = parsetools.sane_quotes(cleaned)
     if not cleaned:
         return

     when = t['timestamp']
     if not isinstance(when, datetime.datetime):
         # The older parser provides only datetime.time objects
         when = datetime.datetime.combine(self.date, when)

     entry = Statement(
         hansard=self.hansard,
         time=when,
         text=cleaned,
         sequence=self.statement_index,
         who='Proceedings',
     )
     self.statement_index += 1
     self.statements.append(entry)
Ejemplo n.º 5
0
def _getHansardNumber(page):
    title = re.search(r'<title>([^<]+)</title>', page).group(1)
    match = re.search(
        r'Number +(\d+\S*) ',
        parsetools.tameWhitespace(title))  # New format: Number 079
    if match:
        return re.sub('^0+', '', match.group(1))
    else:
        match = re.search(r'\((\d+\S*)\)', title)  # Old format (079)
        if match:
            return re.sub('^0+', '', match.group(1))
        else:
            raise Exception("Couldn't parse number from Hansard title: %s" %
                            title)
Ejemplo n.º 6
0
 def saveProceedingsStatement(self, text, t):
     """Append a ``'Proceedings'`` statement built from ``text``, if any.

     ``text`` is stripped and passed through ``parsetools.tameWhitespace``
     then ``parsetools.sane_quotes``. When the result is non-empty, a
     ``Statement`` numbered with ``self.statement_index`` is created, the
     index is incremented and the statement joins ``self.statements``.

     Args:
         text: Raw text of the proceedings block.
         t: Mapping with a ``'timestamp'`` entry; anything that is not a
             ``datetime.datetime`` is combined with ``self.date``.
     """
     text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
     if not text:
         return
     stamp = t['timestamp']
     # The older parser provides only datetime.time objects
     needs_date = not isinstance(stamp, datetime.datetime)
     if needs_date:
         stamp = datetime.datetime.combine(self.date, stamp)
     fields = dict(hansard=self.hansard, time=stamp, text=text,
                   sequence=self.statement_index, who='Proceedings')
     new_statement = Statement(**fields)
     self.statement_index += 1
     self.statements.append(new_statement)
Ejemplo n.º 7
0
 def addText(self, text, blockquote=False):
     """Append a cleaned block of text to ``self._textbuffer``.

     Does nothing while ``self._ignoretext`` is set. The text is stripped,
     passed through ``parsetools.tameWhitespace`` and
     ``parsetools.sane_quotes``, and then has a leading ``':'``,
     ``'He said: '`` and ``'She said: '`` removed, in that order. Empty
     results are dropped.

     Args:
         text: Raw text to add.
         blockquote: When true, the entry is prefixed with ``'> '``. The
             same prefix is applied to text starting with ``'moved '``
             when ``self.hasText()`` is false.
     """
     if self._ignoretext:
         return
     cleaned = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
     # (prefix, slice offset) pairs, applied in order. The "said" offsets
     # stop one short of the prefix length; strip() removes the leftover
     # space.
     for prefix, offset in ((':', 1), ('He said: ', 8), ('She said: ', 9)):
         if cleaned.startswith(prefix):
             cleaned = cleaned[offset:].strip()
     if len(cleaned) == 0 or cleaned.isspace():
         return
     quoted = blockquote or (
         cleaned.startswith('moved ') and not self.hasText())
     self._textbuffer.append(u'> ' + cleaned if quoted else cleaned)
Ejemplo n.º 8
0
 def addText(self, text, blockquote=False):
     """Clean ``text`` and push it onto ``self._textbuffer`` as a new entry.

     Nothing happens while ``self._ignoretext`` is set, or when the
     cleaned text is empty. Cleaning strips the text, applies
     ``parsetools.tameWhitespace`` and ``parsetools.sane_quotes``, and
     removes a leading colon and a leading ``'He said: '`` /
     ``'She said: '``.

     Args:
         text: Raw text to add.
         blockquote: Store the entry with a ``'> '`` prefix. Text that
             starts with ``'moved '`` gets the prefix as well when
             ``self.hasText()`` is false.
     """
     if self._ignoretext:
         return

     chunk = parsetools.tameWhitespace(text.strip())
     chunk = parsetools.sane_quotes(chunk)

     # Strip initial colon
     if chunk.startswith(':'):
         chunk = chunk[1:].strip()
     # The slices below leave the prefix's trailing space behind on
     # purpose; strip() takes care of it.
     if chunk.startswith('He said: '):
         chunk = chunk[8:].strip()
     if chunk.startswith('She said: '):
         chunk = chunk[9:].strip()

     if not chunk or chunk.isspace():
         return

     as_quote = blockquote
     if not as_quote:
         as_quote = chunk.startswith('moved ') and not self.hasText()
     if as_quote:
         chunk = u'> ' + chunk
     self._textbuffer.append(chunk)
Ejemplo n.º 9
0
 def appendToText(self, text, italic=False):
     """Append ``text`` to the last entry of ``self._textbuffer``.

     Nothing happens unless ``self.hasText()`` is true and
     ``self._ignoretext`` is not set, or when the stripped text (after
     ``parsetools.tameWhitespace``) is empty.

     Args:
         text: Raw text to append.
         italic: When true, the text is wrapped as ``' <em>...</em> '``
             before being appended.
     """
     if not self.hasText() or self._ignoretext:
         return
     addition = parsetools.tameWhitespace(text.strip())
     if len(addition) == 0 or addition.isspace():
         return
     if italic:
         addition = u' <em>' + addition + u'</em> '
     self._textbuffer[-1] += addition
Ejemplo n.º 10
0
 def appendToText(self, text, italic=False):
     """Extend the most recent ``self._textbuffer`` entry with ``text``.

     Only acts when ``self.hasText()`` is true and ``self._ignoretext`` is
     not set. The text is stripped and passed through
     ``parsetools.tameWhitespace``; an empty result is ignored.

     Args:
         text: Raw text to append.
         italic: Wrap the text in ``<em>`` tags, padded with a space on
             each side, before appending.
     """
     if self.hasText() and not self._ignoretext:
         fragment = parsetools.tameWhitespace(text.strip())
         if fragment and not fragment.isspace():
             if italic:
                 fragment = u' <em>' + fragment + u'</em> '
             last = len(self._textbuffer) - 1
             self._textbuffer[last] = self._textbuffer[last] + fragment