Beispiel #1
0
    def scrape_vote(self,
                    url,
                    date,
                    chamber,
                    passed,
                    motion,
                    re_digit=re.compile(r'\d{1,3}'),
                    re_totals=re.compile(
                        r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
        """Scrape one roll-call vote page and build a ``Vote`` from it.

        Args:
            url: Address of the vote page; also recorded as the vote source.
            date: Passed through unchanged to the ``Vote`` constructor.
            chamber: Passed through unchanged to the ``Vote`` constructor.
            passed: Passed through unchanged to the ``Vote`` constructor.
            motion: Passed through unchanged to the ``Vote`` constructor.
            re_digit: Compiled pattern that extracts the tally numbers from
                the totals row of a normally formatted page.
            re_totals: Compiled pattern that extracts the tallies from the
                page's plain text when no totals row can be located.

        Returns:
            The populated ``Vote``, or ``None`` when fetching ``url`` fails
            with a 404 and ``url`` is one of the known-bad links.

        Raises:
            scrapelib.HTTPError: For any other fetch failure.
            IndexError: If the page has no "Vote Type:" label, or no value
                can be found next to it.
        """
        namespaces = {"re": "http://exslt.org/regular-expressions"}
        try:
            doc = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            known_fail_links = [
                "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
            ]
            # The status is detected by searching the string form of the
            # response; the original author found no cleaner accessor.
            if "404" in str(e.response) and url in known_fail_links:
                return
            raise

        xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
                 "/ancestor::tr[1]")

        # Get the vote tallies.
        tally_rows = doc.xpath(xpath, namespaces=namespaces)
        if tally_rows:
            totals = re_digit.findall(tally_rows[0].text_content())
        else:
            # The vote page doesn't have the typical format. Maybe it's a
            # hand-edited page. Log and try to parse the tallies from the
            # plain text instead.
            self.log('Found an unusual votes page at url: "%s"' % url)
            totals = re_totals.findall(doc.text_content())
            if len(totals) == 4:
                self.log('...was able to parse vote tallies from "%s"' % url)

        try:
            # NOTE(review): assumes the four numbers appear in the order
            # Yes, No, Not Voting, Absent -- confirm against a live page.
            yes_count, no_count, abstentions, absent = map(int, totals)
        except ValueError:
            # Either the number of values was not four or one of them was
            # not numeric. This is probably a "voice vote" lacking actual
            # vote tallies.
            yes_count, no_count, other_count = 0, 0, 0
        else:
            other_count = abstentions + absent

        # Create the vote object.
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)

        # Add source.
        vote.add_source(url)

        # Get the "vote type": the value sits in the <font> following the
        # label, either as a sibling of the label or of the label's parent.
        el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
        try:
            vote_type = el.xpath('following-sibling::font[1]/text()')[0]
        except IndexError:
            vote_type = el.xpath('../following-sibling::font[1]/text()')[0]

        vote['vote_type'] = vote_type

        # Get a flat sequence like: name1, vote1, name2, vote2, ...
        # A list comprehension (rather than filter()) guarantees a list, so
        # len() works regardless of the interpreter version.
        xpath = ("//font[re:match(., '^[A-Z]$')]"
                 "/../../../descendant::td/font/text()")
        data = [s for s in doc.xpath(xpath, namespaces=namespaces)
                if s.strip()]

        # Handle the rare case where not all names have corresponding
        # text indicating vote value. See e.g. session 146 HB10.
        data_len = len(data) // 2
        # Sum of every value stored on the vote under a key containing
        # '_count'.
        tally = sum(v for (k, v) in vote.items() if '_count' in k)

        if data_len > 0 and data_len != tally:
            # Re-read the last matching table cell by cell so that blank
            # vote cells keep their position next to the name.
            xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
            table = doc.xpath(xpath, namespaces=namespaces)[-1]
            data = [td.text_content().strip()
                    for td in table.xpath('descendant::td')]

        # Add names and vote values.
        vote_map = {
            'Y': 'yes',
            'N': 'no',
        }

        # Zipping one iterator with itself yields consecutive
        # (name, vote) pairs; a trailing unpaired name is dropped.
        cells = iter(data)
        for name, code in zip(cells, cells):
            # Evidently, the motion for vote can be rescinded before
            # the vote is cast, perhaps due to a quorum failure.
            # (See the Senate vote (1/26/2011) for HB 10 w/HA 1.) In
            # this rare case, values in the vote col are whitespace. Skip.
            code = code.strip()
            if not code:
                continue

            # Look up the stripped code so that a padded "Y"/"N" is not
            # misfiled as "other".
            getattr(vote, vote_map.get(code, 'other'))(name)

        return vote
Beispiel #2
0
    def scrape_vote(self, url,
                    re_digit=re.compile(r'\d{1,3}'),
                    re_totals=re.compile(
                        r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
        """Scrape one roll-call vote page and build a ``Vote`` from it.

        Date, outcome, tallies, chamber and motion are all read from the
        page itself.

        Args:
            url: Address of the vote page; also recorded as the vote source.
            re_digit: Kept for interface compatibility; not consulted.
            re_totals: Compiled pattern used only to report whether tallies
                could be recovered from the plain text of an unusual page.

        Returns:
            The populated ``Vote``. ``None`` is returned when the page is a
            committee report, or when fetching ``url`` fails with a 404 and
            ``url`` is one of the known-bad links.

        Raises:
            scrapelib.HTTPError: For any other fetch failure.
            ValueError: If the page has no "Date:" label, the date text
                cannot be parsed, or the chamber cannot be determined.
            IndexError: If the page has no "Vote Type:" label, or no value
                can be found next to it.
        """
        namespaces = {"re": "http://exslt.org/regular-expressions"}
        try:
            html = self.urlopen(url)
            doc = lxml.html.fromstring(html)
        except scrapelib.HTTPError as e:
            known_fail_links = [
                "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
            ]
            # The status is detected by searching the string form of the
            # response; the original author found no cleaner accessor.
            if "404" in str(e.response) and url in known_fail_links:
                msg = 'Recieved a bogus 22/404 return code. Skipping vote.'
                self.warning(msg)
                return
            raise

        # NOTE(review): on Python 3 tostring() returns bytes by default, so
        # this str-in-bytes test would need an encoding argument -- confirm
        # the target interpreter.
        if 'Committee Report' in lxml.html.tostring(doc):
            # This was a committee vote with weird formatting.
            self.info('Skipping committee report.')
            return

        xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
                 "/ancestor::tr[1]")

        # The tallies themselves are read from the <font> text further
        # down; this lookup only flags pages without the usual totals row.
        if not doc.xpath(xpath, namespaces=namespaces):
            # The vote page doesn't have the typical format. Maybe it's a
            # hand-edited page.
            self.warning('Found an unusual votes page at url: "%s"' % url)
            if len(re_totals.findall(doc.text_content())) == 4:
                self.warning('...was able to parse vote tallies from "%s"' %
                             url)

        # Values are located by their offset from a label in the flat list
        # of <font> text nodes.
        font_text = [s.strip() for s in doc.xpath('//font/text()')]
        date_index = font_text.index('Date:')
        date_string = font_text[date_index + 2]
        try:
            # %p only influences the parsed hour together with %I, so the
            # 12-hour directive is needed for PM times to come out right.
            date = datetime.strptime(date_string, '%m/%d/%Y %I:%M %p')
        except ValueError:
            # Hours outside 1-12 are rejected by %I; fall back to the
            # original 24-hour reading rather than failing.
            date = datetime.strptime(date_string, '%m/%d/%Y %H:%M %p')
        # NOTE(review): any non-empty text counts as passed here -- confirm
        # whether the cell should be compared against a specific status.
        passed = bool(font_text[date_index + 4])

        counts = defaultdict(int)
        for key, label in [
                ('yes_count', 'Yes:'),
                ('no_count', 'No:'),
                ('absent_count', 'Absent:'),
                ('not_voting', 'Not Voting:')]:
            try:
                counts[key] = int(font_text[font_text.index(label) + 2])
            except ValueError:
                # Label missing or value not numeric: the defaultdict
                # leaves this tally at 0.
                continue
        counts['other_count'] = counts['absent_count'] + counts['not_voting']

        chamber_string = doc.xpath('string(//b/u/font/text())').lower()
        if 'senate' in chamber_string:
            chamber = 'upper'
        elif 'house' in chamber_string:
            chamber = 'lower'
        else:
            # Fail with a clear message instead of an unbound-local error
            # when the header names neither chamber.
            raise ValueError(
                'Could not determine chamber for vote at "%s"' % url)

        # Try the known motion locations in turn and keep the first
        # non-empty one.
        for motion_xpath in (
                'string(//td/b/text())',
                'string(//td/b/font/text())',
                'string(//form/b/font/text())'):
            motion = doc.xpath(motion_xpath)
            if motion:
                break
            # Will fail at validictory level if no motion found.

        # Create the vote object.
        vote = Vote(chamber, date, motion, passed,
                    counts['yes_count'], counts['no_count'],
                    counts['other_count'])

        # Add source.
        vote.add_source(url)

        # Get the "vote type": the value sits in the <font> following the
        # label, either as a sibling of the label or of the label's parent.
        el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
        try:
            vote_type = el.xpath('following-sibling::font[1]/text()')[0]
        except IndexError:
            vote_type = el.xpath('../following-sibling::font[1]/text()')[0]

        vote['vote_type'] = vote_type

        # Get a flat sequence like: name1, vote1, name2, vote2, ...
        # A list comprehension (rather than filter()) guarantees a list, so
        # len() works regardless of the interpreter version.
        xpath = ("//font[re:match(., '^[A-Z]$')]"
                 "/../../../descendant::td/font/text()")
        data = [s for s in doc.xpath(xpath, namespaces=namespaces)
                if s.strip()]

        # Handle the rare case where not all names have corresponding
        # text indicating vote value. See e.g. session 146 HB10.
        data_len = len(data) // 2
        # Sum of every value stored on the vote under a key containing
        # '_count'.
        tally = sum(v for (k, v) in vote.items() if '_count' in k)

        if data_len > 0 and data_len != tally:
            # Re-read the last matching table cell by cell so that blank
            # vote cells keep their position next to the name.
            xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
            table = doc.xpath(xpath, namespaces=namespaces)[-1]
            data = [td.text_content().strip()
                    for td in table.xpath('descendant::td')]

        # Add names and vote values.
        vote_map = {
            'Y': 'yes',
            'N': 'no',
        }

        # Zipping one iterator with itself yields consecutive
        # (name, vote) pairs; a trailing unpaired name is dropped.
        cells = iter(data)
        for name, code in zip(cells, cells):
            # Evidently, the motion for vote can be rescinded before
            # the vote is cast, perhaps due to a quorum failure.
            # (See the Senate vote (1/26/2011) for HB 10 w/HA 1.) In
            # this rare case, values in the vote col are whitespace. Skip.
            code = code.strip()
            if not code:
                continue

            # Look up the stripped code so that a padded "Y"/"N" is not
            # misfiled as "other".
            getattr(vote, vote_map.get(code, 'other'))(name)

        return vote
Beispiel #3
0
    def scrape_vote(self, url, date, chamber, passed, motion,
                    re_digit=re.compile(r'\d{1,3}'),
                    re_totals=re.compile(
                        r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
        """Scrape one roll-call vote page and build a ``Vote`` from it.

        Args:
            url: Address of the vote page; also recorded as the vote source.
            date: Passed through unchanged to the ``Vote`` constructor.
            chamber: Passed through unchanged to the ``Vote`` constructor.
            passed: Passed through unchanged to the ``Vote`` constructor.
            motion: Passed through unchanged to the ``Vote`` constructor.
            re_digit: Compiled pattern that extracts the tally numbers from
                the totals row of a normally formatted page.
            re_totals: Compiled pattern that extracts the tallies from the
                page's plain text when no totals row can be located.

        Returns:
            The populated ``Vote``, or ``None`` when fetching ``url`` fails
            with a 404 and ``url`` is one of the known-bad links.

        Raises:
            scrapelib.HTTPError: For any other fetch failure.
            IndexError: If the page has no "Vote Type:" label, or no value
                can be found next to it.
        """
        namespaces = {"re": "http://exslt.org/regular-expressions"}
        try:
            doc = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            known_fail_links = [
                "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
            ]
            # The status is detected by searching the string form of the
            # response; the original author found no cleaner accessor.
            if "404" in str(e.response) and url in known_fail_links:
                return
            raise

        xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
                 "/ancestor::tr[1]")

        # Get the vote tallies.
        tally_rows = doc.xpath(xpath, namespaces=namespaces)
        if tally_rows:
            totals = re_digit.findall(tally_rows[0].text_content())
        else:
            # The vote page doesn't have the typical format. Maybe it's a
            # hand-edited page. Log and try to parse the tallies from the
            # plain text instead.
            self.log('Found an unusual votes page at url: "%s"' % url)
            totals = re_totals.findall(doc.text_content())
            if len(totals) == 4:
                self.log('...was able to parse vote tallies from "%s"' % url)

        try:
            # NOTE(review): assumes the four numbers appear in the order
            # Yes, No, Not Voting, Absent -- confirm against a live page.
            yes_count, no_count, abstentions, absent = map(int, totals)
        except ValueError:
            # Either the number of values was not four or one of them was
            # not numeric. This is probably a "voice vote" lacking actual
            # vote tallies.
            yes_count, no_count, other_count = 0, 0, 0
        else:
            other_count = abstentions + absent

        # Create the vote object.
        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)

        # Add source.
        vote.add_source(url)

        # Get the "vote type": the value sits in the <font> following the
        # label, either as a sibling of the label or of the label's parent.
        el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
        try:
            vote_type = el.xpath('following-sibling::font[1]/text()')[0]
        except IndexError:
            vote_type = el.xpath('../following-sibling::font[1]/text()')[0]

        vote['vote_type'] = vote_type

        # Get a flat sequence like: name1, vote1, name2, vote2, ...
        # A list comprehension (rather than filter()) guarantees a list, so
        # len() works regardless of the interpreter version.
        xpath = ("//font[re:match(., '^[A-Z]$')]"
                 "/../../../descendant::td/font/text()")
        data = [s for s in doc.xpath(xpath, namespaces=namespaces)
                if s.strip()]

        # Handle the rare case where not all names have corresponding
        # text indicating vote value. See e.g. session 146 HB10.
        data_len = len(data) // 2
        # Sum of every value stored on the vote under a key containing
        # '_count'.
        tally = sum(v for (k, v) in vote.items() if '_count' in k)

        if data_len > 0 and data_len != tally:
            # Re-read the last matching table cell by cell so that blank
            # vote cells keep their position next to the name.
            xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
            table = doc.xpath(xpath, namespaces=namespaces)[-1]
            data = [td.text_content().strip()
                    for td in table.xpath('descendant::td')]

        # Add names and vote values.
        vote_map = {
            'Y': 'yes',
            'N': 'no',
        }

        # Zipping one iterator with itself yields consecutive
        # (name, vote) pairs; a trailing unpaired name is dropped.
        cells = iter(data)
        for name, code in zip(cells, cells):
            # Evidently, the motion for vote can be rescinded before
            # the vote is cast, perhaps due to a quorum failure.
            # (See the Senate vote (1/26/2011) for HB 10 w/HA 1.) In
            # this rare case, values in the vote col are whitespace. Skip.
            code = code.strip()
            if not code:
                continue

            # Look up the stripped code so that a padded "Y"/"N" is not
            # misfiled as "other".
            getattr(vote, vote_map.get(code, 'other'))(name)

        return vote
Beispiel #4
0
    def scrape_vote(self,
                    url,
                    re_digit=re.compile(r'\d{1,3}'),
                    re_totals=re.compile(
                        r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
        """Scrape one roll-call vote page and build a ``Vote`` from it.

        Date, outcome, tallies, chamber and motion are all read from the
        page itself.

        Args:
            url: Address of the vote page; also recorded as the vote source.
            re_digit: Kept for interface compatibility; not consulted.
            re_totals: Compiled pattern used only to report whether tallies
                could be recovered from the plain text of an unusual page.

        Returns:
            The populated ``Vote``. ``None`` is returned when the page is a
            committee report, or when fetching ``url`` fails with a 404 and
            ``url`` is one of the known-bad links.

        Raises:
            scrapelib.HTTPError: For any other fetch failure.
            ValueError: If the page has no "Date:" label, the date text
                cannot be parsed, or the chamber cannot be determined.
            IndexError: If the page has no "Vote Type:" label, or no value
                can be found next to it.
        """
        namespaces = {"re": "http://exslt.org/regular-expressions"}
        try:
            html = self.urlopen(url)
            doc = lxml.html.fromstring(html)
        except scrapelib.HTTPError as e:
            known_fail_links = [
                "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument",
                'http://legis.delaware.gov/LIS/lis147.nsf/7712cf7cc0e9227a852568470077336f/5f86852ea6649fa285257d08001bbe06?OpenDocument'
            ]
            # The status is detected by searching the string form of the
            # response; the original authors found no cleaner accessor.
            if "404" in str(e.response) and url in known_fail_links:
                msg = 'Recieved a bogus 22/404 return code. Skipping vote.'
                self.warning(msg)
                return
            raise

        # NOTE(review): on Python 3 tostring() returns bytes by default, so
        # this str-in-bytes test would need an encoding argument -- confirm
        # the target interpreter.
        if 'Committee Report' in lxml.html.tostring(doc):
            # This was a committee vote with weird formatting.
            self.info('Skipping committee report.')
            return

        xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
                 "/ancestor::tr[1]")

        # The tallies themselves are read from the <font> text further
        # down; this lookup only flags pages without the usual totals row.
        if not doc.xpath(xpath, namespaces=namespaces):
            # The vote page doesn't have the typical format. Maybe it's a
            # hand-edited page.
            self.warning('Found an unusual votes page at url: "%s"' % url)
            if len(re_totals.findall(doc.text_content())) == 4:
                self.warning('...was able to parse vote tallies from "%s"' %
                             url)

        # Values are located by their offset from a label in the flat list
        # of <font> text nodes.
        font_text = [s.strip() for s in doc.xpath('//font/text()')]
        date_index = font_text.index('Date:')
        date_string = font_text[date_index + 2]
        try:
            # %p only influences the parsed hour together with %I, so the
            # 12-hour directive is needed for PM times to come out right.
            date = datetime.strptime(date_string, '%m/%d/%Y %I:%M %p')
        except ValueError:
            # Hours outside 1-12 are rejected by %I; fall back to the
            # original 24-hour reading rather than failing.
            date = datetime.strptime(date_string, '%m/%d/%Y %H:%M %p')
        # NOTE(review): any non-empty text counts as passed here -- confirm
        # whether the cell should be compared against a specific status.
        passed = bool(font_text[date_index + 4])

        counts = defaultdict(int)
        for key, label in [('yes_count', 'Yes:'), ('no_count', 'No:'),
                           ('absent_count', 'Absent:'),
                           ('not_voting', 'Not Voting:')]:
            try:
                counts[key] = int(font_text[font_text.index(label) + 2])
            except ValueError:
                # Label missing or value not numeric: the defaultdict
                # leaves this tally at 0.
                continue
        counts['other_count'] = counts['absent_count'] + counts['not_voting']

        chamber_string = doc.xpath('string(//b/u/font/text())').lower()
        if 'senate' in chamber_string:
            chamber = 'upper'
        elif 'house' in chamber_string:
            chamber = 'lower'
        else:
            # Fail with a clear message instead of an unbound-local error
            # when the header names neither chamber.
            raise ValueError(
                'Could not determine chamber for vote at "%s"' % url)

        # Try the known motion locations in turn and keep the first
        # non-empty one.
        for motion_xpath in ('string(//td/b/text())',
                             'string(//td/b/font/text())',
                             'string(//form/b/font/text())'):
            motion = doc.xpath(motion_xpath)
            if motion:
                break
            # Will fail at validictory level if no motion found.

        # Create the vote object.
        vote = Vote(chamber, date, motion, passed, counts['yes_count'],
                    counts['no_count'], counts['other_count'])

        # Add source.
        vote.add_source(url)

        # Get the "vote type": the value sits in the <font> following the
        # label, either as a sibling of the label or of the label's parent.
        el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
        try:
            vote_type = el.xpath('following-sibling::font[1]/text()')[0]
        except IndexError:
            vote_type = el.xpath('../following-sibling::font[1]/text()')[0]

        vote['vote_type'] = vote_type

        # Get a flat sequence like: name1, vote1, name2, vote2, ...
        # A list comprehension (rather than filter()) guarantees a list, so
        # len() works regardless of the interpreter version.
        xpath = ("//font[re:match(., '^[A-Z]$')]"
                 "/../../../descendant::td/font/text()")
        data = [s for s in doc.xpath(xpath, namespaces=namespaces)
                if s.strip()]

        # Handle the rare case where not all names have corresponding
        # text indicating vote value. See e.g. session 146 HB10.
        data_len = len(data) // 2
        # Sum of every value stored on the vote under a key containing
        # '_count'.
        tally = sum(v for (k, v) in vote.items() if '_count' in k)

        if data_len > 0 and data_len != tally:
            # Re-read the last matching table cell by cell so that blank
            # vote cells keep their position next to the name.
            xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
            table = doc.xpath(xpath, namespaces=namespaces)[-1]
            data = [td.text_content().strip()
                    for td in table.xpath('descendant::td')]

        # Add names and vote values.
        vote_map = {
            'Y': 'yes',
            'N': 'no',
        }

        # Zipping one iterator with itself yields consecutive
        # (name, vote) pairs; a trailing unpaired name is dropped.
        cells = iter(data)
        for name, code in zip(cells, cells):
            # Evidently, the motion for vote can be rescinded before
            # the vote is cast, perhaps due to a quorum failure.
            # (See the Senate vote (1/26/2011) for HB 10 w/HA 1.) In
            # this rare case, values in the vote col are whitespace. Skip.
            code = code.strip()
            if not code:
                continue

            # Look up the stripped code so that a padded "Y"/"N" is not
            # misfiled as "other".
            getattr(vote, vote_map.get(code, 'other'))(name)

        return vote