Example #1
0
    def scrape_hansard(self, session, chamber, url, hansard_id):
        """Scrape one Hansard page and save every attributed speech on it.

        Every ``<p>`` on the page is visited in document order and handled
        according to its ``class`` attribute:

        * ``SubjectHeading`` / ``ProceduralHeading`` update the subject and
          section attached to the speeches that follow.
        * ``DateOfTranscript`` / ``TitlePageDate`` set the day attached to
          the speeches that follow.
        * ``SpeakerBegins`` closes (saves) the speech in progress and opens
          a new one, attributed to the first ``span.Attribution`` inside
          the paragraph.
        * ``SpeakerContinues`` appends its text to the speech in progress.

        Paragraphs without a class, or with any other class, are ignored.
        Malformed ``SpeakerBegins`` paragraphs (no attribution, no date seen
        yet, empty speaker name) are logged at debug level and skipped; the
        speech in progress is left open in all three cases.

        Args:
            session: passed through unchanged to ``Speech``.
            chamber: passed through unchanged to ``Speech``.
            url: page to fetch with ``self.lxmlize``; also recorded as the
                source of every speech.
            hansard_id: passed through unchanged to ``Speech``.

        Raises:
            ValueError: if a date paragraph does not match the
                ``"%A, %B %d, %Y"`` format.
        """
        whitespace = re.compile(r"\s+")

        subject = None
        procedure = None
        speech = None
        day = None
        sequence = 1

        page = self.lxmlize(url)
        for para in page.xpath(".//p"):
            klass = para.get('class')
            if klass is None:
                continue  # Some para entries have no class.
            klass = klass.strip()

            if klass == 'SubjectHeading':
                subject = whitespace.sub(" ", para.text_content()).strip()

            elif klass == 'ProceduralHeading':
                procedure = whitespace.sub(" ", para.text_content()).strip()

            elif klass == 'SpeakerBegins':
                attribution = [x.text_content().strip() for x in
                               para.xpath(".//span[@class='Attribution']")]

                # XXX: Check if we have a J. Q. Public: at the beginning
                # to mark as attributed. Early results show fail on picking
                # that up.

                if not attribution:
                    logger.debug("Error: Speaker began without attribution")
                    logger.debug("  URL: %s", url)
                    logger.debug("  Txt: %s", para.text_content()[:30])
                    continue

                if day is None:
                    logger.debug("Error: Day is None. Bad juju.")
                    logger.debug(url)
                    continue

                # Validate the speaker *before* closing the previous speech.
                # Saving first and then bailing out would leave the already
                # saved speech as the open one, so it would be saved again
                # by the next SpeakerBegins or at the end of the page.
                person = attribution[0].rstrip(":")
                if person == "":
                    logger.debug("Error: empty person string. Bad juju.")
                    continue

                if speech is not None:
                    self.save_object(speech)

                speech = Speech(session,
                                chamber,
                                hansard_id,
                                day,
                                sequence,
                                person,
                                para.text_content(),
                                subject=subject,
                                section=procedure)
                speech.add_source(url)
                sequence += 1

            elif klass == 'SpeakerContinues':
                if speech is None:
                    logger.debug("Continue before a begin. bad juju.")
                    continue

                speech['text'] += "\n%s" % (para.text_content())

            elif klass in ('DateOfTranscript', 'TitlePageDate'):
                # Drop non-ASCII characters, then decode back to text:
                # strptime() rejects bytes on Python 3.
                date_text = para.text_content().strip().encode(
                    "ascii",
                    "ignore"
                ).decode("ascii")
                day = dt.datetime.strptime(date_text, "%A, %B %d, %Y")

        # The last speech on the page has no following SpeakerBegins to
        # trigger its save.
        if speech is not None:
            self.save_object(speech)
Example #2
0
    def scrape_day(self, session, chamber, day_url):
        """Scrape one day's floor transcript and save each speech in it.

        The children of ``div#transcript`` (minus the leading navigation
        div) are walked in order:

        * ``<h2>`` starts a new major section, ``<h3>`` a subsection of the
          most recent ``<h2>``.
        * ``<p class="speakerStart">`` opens a new speech; its first child
          supplies the anchor name and its second child the speaker.
        * ``<p class="timeStamp">`` updates the hour and minute applied to
          the speeches created after it.
        * ``<p class="procedure">``, or a class-less ``<p>`` while nobody is
          speaking, is saved immediately as a ``type='procedure'`` speech.
        * any other class-less ``<p>`` is appended to the open speech.

        Each speaker speech is saved exactly once, when its turn ends: at
        the next ``speakerStart``, at the next saved procedural paragraph,
        or at the end of the transcript.

        Args:
            session: passed through unchanged to ``Speech``.
            chamber: passed through unchanged to ``Speech``.
            day_url: transcript URL; must contain ``Date=YYYY-M-D``. Also
                used as the base of every speech's source anchor.

        Raises:
            IndexError: if ``day_url`` has no ``Date=`` component, the page
                has no ``div#transcript``, a ``speakerStart`` paragraph has
                fewer than two children, or a non-empty procedural
                paragraph has no ``<a>`` child.
        """
        doc = self.lxmlize(day_url)

        date = re.findall(r'Date=(\d{4}-\d{1,2}-\d{1,2})', day_url)[0]
        when = datetime.datetime.strptime(date, '%Y-%m-%d')
        sequence = 0
        last_h2 = ''
        section = ''
        # Speaker speech currently being accumulated; not yet saved.
        speech = None

        transcript = doc.xpath('//div[@id="transcript"]')[0]
        # skip first item, navigation div
        for item in transcript.getchildren()[1:]:
            is_para = item.tag == 'p'
            klass = item.get('class') if is_para else None

            if item.tag == 'h2':
                # new major section
                last_h2 = clean_spaces(item.text_content())
                section = last_h2
            elif item.tag == 'h3':
                # new subsection
                section = last_h2 + ': ' + clean_spaces(item.text_content())
            elif is_para and klass == 'speakerStart':
                # new speaker: the previous speaker's turn is over
                if speech is not None:
                    self.save_speech(speech)
                    speech = None

                # 99% of the time there are two children, <a>, <strong>
                # sometimes there are more (looks like format errors),
                # so we warn for now
                children = item.getchildren()
                a = children[0]
                strong = children[1]
                if len(children) > 2:
                    self.warning('found extra tags in speakerStart: %s',
                                 ', '.join(x.tag for x in children))
                anchor = day_url + '#' + a.get('name')
                speaker = strong.text_content().rstrip(':')
                # tail is None when nothing follows <strong>; keep the text
                # a string so continuation paragraphs can be appended.
                text = strong.tail or ''
                sequence += 1
                speech = Speech(session, chamber, 'floor-' + date, when,
                                sequence, speaker, text, section=section)
                speech.add_source(anchor)
            elif is_para and klass == 'timeStamp':
                # text is hour digits followed by two minute digits
                timestamp = item.text_content().strip()
                when = when.replace(hour=int(timestamp[:-2]),
                                    minute=int(timestamp[-2:]))
            elif is_para and (klass == 'procedure' or
                              (klass is None and speech is None)):
                # procedural action indicated by procedural tag or by
                # an empty tag with nobody speaking in prior session
                sequence += 1

                procedure_text = item.text_content()
                if procedure_text.strip() == '':
                    continue

                # an explicit procedure paragraph ends any open speech
                if speech is not None:
                    self.save_speech(speech)
                    speech = None

                # looked up only for paragraphs we keep, so an empty
                # paragraph without an <a> child cannot raise here
                anchor = day_url + '#' + item.xpath('a')[0].get('name')
                procedure = Speech(session, chamber, 'floor-' + date, when,
                                   sequence, '-fixme-', procedure_text,
                                   section=section, type='procedure')
                procedure.add_source(anchor)
                self.save_speech(procedure)
            elif is_para and klass is None:
                # continuation of the open speech (speech is not None here,
                # otherwise the procedural branch above would have matched)
                more_text = item.text_content()
                if not more_text:
                    continue
                extra = item.getchildren()
                if len(extra) > 1:
                    self.warning(
                        'found extra tags in speech continuation: %s',
                        ', '.join(x.tag for x in extra))
                speech['text'] += '\n\n' + more_text
            elif is_para:
                self.error('unknown p class=%s', klass)
            else:
                self.error('unexpected tag <%s>', item.tag)

        # the final speaker has no following element to trigger the save
        if speech is not None:
            self.save_speech(speech)