Python Speech Examples

Programming Language: Python

Namespace/Package Name: billy.scrape.speeches

Class/Type: Speech

Examples at hotexamples.com: 2

Python Speech - 2 examples found. These are the top-rated real-world Python examples of billy.scrape.speeches.Speech, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

add_source(2)

Example #1

Show file

File: speeches.py Project: pdmholden/open13

    def scrape_hansard(self, session, chamber, url, hansard_id):
        """Scrape one Hansard transcript page and save every speech on it.

        The page at ``url`` is walked ``<p>`` by ``<p>``; the (stripped)
        ``class`` attribute of each paragraph decides what happens:

        * ``SubjectHeading`` / ``ProceduralHeading`` -- remember the
          whitespace-normalised heading; it is attached as ``subject`` /
          ``section`` to speeches created afterwards.
        * ``DateOfTranscript`` / ``TitlePageDate`` -- parse the sitting day.
        * ``SpeakerBegins`` -- save the speech in progress (if any) and start
          a new ``Speech`` attributed to the first ``Attribution`` span.
        * ``SpeakerContinues`` -- append the paragraph to the speech in
          progress.

        Paragraphs with no class, or any other class, are ignored.

        Args:
            session: Passed through to ``Speech``.
            chamber: Passed through to ``Speech``.
            url: Page fetched through ``self.lxmlize``; also recorded as the
                source of every speech.
            hansard_id: Passed through to ``Speech``.

        Raises:
            ValueError: If a date paragraph does not match
                ``"%A, %B %d, %Y"`` once non-ASCII characters are dropped.
        """
        subject = None
        procedure = None
        speech = None  # speech being accumulated; saved when the next begins
        day = None
        sequence = 1

        page = self.lxmlize(url)
        for para in page.xpath(".//p"):
            klass = para.get('class')
            if klass is None:
                continue  # Some para entries have no class.
            klass = klass.strip()

            if klass == 'SubjectHeading':
                subject = re.sub(r"\s+", " ", para.text_content()).strip()

            elif klass == 'ProceduralHeading':
                procedure = re.sub(r"\s+", " ", para.text_content()).strip()

            elif klass in ('DateOfTranscript', 'TitlePageDate'):
                # Drop non-ASCII characters, then decode back to text so that
                # strptime receives a string rather than bytes.
                date_text = para.text_content().strip().encode(
                    "ascii",
                    "ignore"
                ).decode("ascii")
                day = dt.datetime.strptime(date_text, "%A, %B %d, %Y")

            elif klass == 'SpeakerContinues':
                if speech is None:
                    logger.debug("Continue before a begin. bad juju.")
                    continue

                speech['text'] += "\n%s" % (para.text_content(),)

            elif klass == 'SpeakerBegins':
                attribution = [x.text_content().strip() for x in
                               para.xpath(".//span[@class='Attribution']")]

                # XXX: Check if we have a J. Q. Public: at the beginning
                # to mark as attributed. Early results show fail on picking
                # that up.

                if not attribution:
                    logger.debug("Error: Speaker began without attribution")
                    logger.debug("  URL: %s", url)
                    logger.debug("  Txt: %s", para.text_content()[:30])
                    continue

                if day is None:
                    logger.debug("Error: Day is None. Bad juju.")
                    logger.debug(url)
                    continue

                # Validate the speaker *before* saving the speech in
                # progress: bailing out after the save would leave `speech`
                # pointing at an already-saved object, which would then be
                # saved a second time later on.
                person = attribution[0].rstrip(":").strip()
                if not person:
                    logger.debug("Error: empty person string. Bad juju.")
                    continue

                if speech is not None:
                    self.save_object(speech)

                speech = Speech(session,
                                chamber,
                                hansard_id,
                                day,
                                sequence,
                                person,
                                para.text_content(),
                                subject=subject,
                                section=procedure)
                speech.add_source(url)
                sequence += 1

        # Flush the last speech on the page.
        if speech is not None:
            self.save_object(speech)

Example #2

Show file

File: speeches.py Project: drewstaylor/open13

    def scrape_day(self, session, chamber, day_url):
        """Scrape one day's floor transcript and save its speeches.

        The children of ``<div id="transcript">`` (minus the first, a
        navigation div) are processed in document order:

        * ``<h2>`` starts a new major section, ``<h3>`` a subsection of the
          most recent ``<h2>``.
        * ``<p class="speakerStart">`` starts a new speech.
        * ``<p class="timeStamp">`` updates the hour/minute applied to
          speeches created afterwards.
        * ``<p class="procedure">``, or a class-less ``<p>`` while nobody is
          speaking, is saved immediately as a ``type='procedure'`` speech.
        * any other class-less ``<p>`` is appended to the speech in progress.

        A speech in progress is saved exactly once: when the next speaker
        starts, when a procedural entry interrupts it, or at the end of the
        transcript.  Unknown tags / paragraph classes are reported through
        ``self.error``.

        Args:
            session: Passed through to ``Speech``.
            chamber: Passed through to ``Speech``.
            day_url: Transcript URL; must contain ``Date=YYYY-M-D``.  It is
                fetched through ``self.lxmlize`` and used as the base for
                per-speech source anchors.

        Raises:
            IndexError: If ``day_url`` carries no ``Date=`` component or the
                page has no ``<div id="transcript">``.
        """
        doc = self.lxmlize(day_url)

        date = re.findall(r'Date=(\d{4}-\d{1,2}-\d{1,2})', day_url)[0]
        when = datetime.datetime.strptime(date, '%Y-%m-%d')
        sequence = 0
        last_h2 = ''
        section = ''
        speech = None  # speech being accumulated; not yet saved

        transcript = doc.xpath('//div[@id="transcript"]')[0]
        # skip first item, navigation div
        for item in transcript.getchildren()[1:]:
            if item.tag == 'h2':
                # new major section
                last_h2 = clean_spaces(item.text_content())
                section = last_h2
                continue
            if item.tag == 'h3':
                # new subsection
                section = last_h2 + ': ' + clean_spaces(item.text_content())
                continue
            if item.tag != 'p':
                self.error('unexpected tag <%s>', item.tag)
                continue

            klass = item.get('class')

            if klass == 'speakerStart':
                # new speaker
                # 99% of the time there are two children, <a>, <strong>
                # sometimes there are more (looks like format errors),
                # so we warn for now
                children = item.getchildren()
                a = children[0]
                strong = children[1]
                if len(children) > 2:
                    self.warning('found extra tags in speakerStart: %s',
                                 ', '.join(x.tag for x in children))
                anchor = day_url + '#' + a.get('name')
                speaker = strong.text_content().rstrip(':')
                # .tail is None when nothing follows <strong>; keep the text
                # a string so later paragraphs can be appended to it.
                text = strong.tail or ''
                sequence += 1

                # The previous speaker is finished: save them now.
                if speech is not None:
                    self.save_speech(speech)
                speech = Speech(session, chamber, 'floor-' + date, when,
                                sequence, speaker, text, section=section)
                speech.add_source(anchor)

            elif klass == 'timeStamp':
                # Strip first: the value is sliced by position (last two
                # characters are the minutes), so stray whitespace would
                # shift the digits.
                timestamp = item.text_content().strip()
                when = when.replace(hour=int(timestamp[:-2]),
                                    minute=int(timestamp[-2:]))

            elif klass == 'procedure' or (klass is None and speech is None):
                # procedural action indicated by procedural tag or by
                # an empty tag with nobody speaking in prior session
                sequence += 1  # consumed even when the paragraph is empty

                if item.text_content().strip() == '':
                    continue

                # Looked up only for non-empty paragraphs so that an empty
                # paragraph without an <a> child cannot raise IndexError.
                anchor = day_url + '#' + item.xpath('a')[0].get('name')

                # A procedural entry ends whatever speech was in progress.
                if speech is not None:
                    self.save_speech(speech)

                procedural = Speech(session, chamber, 'floor-' + date, when,
                                    sequence, '-fixme-', item.text_content(),
                                    section=section, type='procedure')
                procedural.add_source(anchor)
                self.save_speech(procedural)
                speech = None

            elif klass is None:
                # continuation of the current speaker
                if not item.text_content():
                    continue
                extra = item.getchildren()
                if len(extra) > 1:
                    # Report this paragraph's own children (not the ones
                    # left over from the last speakerStart paragraph).
                    self.warning(
                        'found extra tags in continuation paragraph: %s',
                        ', '.join(x.tag for x in extra))
                speech['text'] += '\n\n' + item.text_content()

            else:
                self.error('unknown p class=%s', klass)

        # Flush the last speech of the day.
        if speech is not None:
            self.save_speech(speech)