Example #1
def _scrape_solo_bills(options, metadata):
    _clear_scraped_data(options.output_dir, 'bills')
    scraper = _get_configured_scraper('bills', options, metadata)

    if len(options.chambers) == 1:
        chamber = options.chambers[0]
    else:
        raise ScrapeError('must specify --chamber when providing a --bill')
    if len(options.sessions):
        session = list(options.sessions)[0]
    else:
        raise ScrapeError('must specify --session when providing a --bill')

    for bill_id in options.solo_bills:
        scraper.scrape_bill(chamber, session, bill_id)
Example #2
def parse_exec_date(date_str):
    """
    Parse dates for executive actions.
    """
    match = re.search(r'((\w+) (\d{1,2}),\s?(\d{4,4}))', date_str)
    if match:
        date_str = "%s %s, %s" % (match.group(2), match.group(3),
                                  match.group(4))
        return datetime.datetime.strptime(date_str, "%B %d, %Y")

    match = re.search(r'((\w+), (\d{1,2}),\s?(\d{4,4}))', date_str)
    if match:
        date_str = "%s, %s, %s" % (match.group(2), match.group(3),
                                   match.group(4))
        return datetime.datetime.strptime(date_str, "%B, %d, %Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{4,4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%Y")

    match = re.search(r'(\d{1,2}/\d{1,2}/\d{2,2})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%y")

    raise ScrapeError("Invalid executive action date: %s" % date_str)
Example #3
def _run_scraper(scraper_type, options, metadata):
    """
        scraper_type: bills, legislators, committees, votes
    """
    _clear_scraped_data(options.output_dir, scraper_type)
    if scraper_type == 'speeches':
        _clear_scraped_data(options.output_dir, 'events')

    scraper = _get_configured_scraper(scraper_type, options, metadata)
    ua_email = os.environ.get('BILLY_UA_EMAIL')
    if ua_email and scraper:
        scraper.user_agent += ' ({})'.format(ua_email)
    if not scraper:
        return [{
            "type": scraper_type,
            "start_time": dt.datetime.utcnow(),
            "noscraper": True,
            "end_time": dt.datetime.utcnow()
        }]

    runs = []

    # Removed from the inner loop due to non-bicameral scrapers
    scrape = {"type": scraper_type}
    scrape['start_time'] = dt.datetime.utcnow()

    if scraper_type in ('bills', 'votes', 'events', 'speeches'):
        times = options.sessions
        for time in times:
            scraper.validate_session(time, scraper.latest_only)
    elif scraper_type in ('committees', 'legislators'):
        times = options.terms
        for time in times:
            scraper.validate_term(time, scraper.latest_only)

    # run scraper against year/session/term
    for time in times:
        # copy so we don't mutate options.chambers across sessions
        chambers = list(options.chambers)
        if scraper_type == 'events' and len(options.chambers) == 2:
            chambers.append('other')

        if _is_old_scrape(scraper.scrape):
            # old-style scrapers take (chamber, session/term)
            for chamber in chambers:
                scraper.scrape(chamber, time)
        else:
            scraper.scrape(time, chambers=chambers)

        # error out if the scraper saved nothing; events and votes are
        # allowed to come back empty
        if not scraper.object_count and scraper_type not in ('events',
                                                             'votes'):
            raise ScrapeError("%s scraper didn't save any objects" %
                              scraper_type)

    scrape['end_time'] = dt.datetime.utcnow()
    runs.append(scrape)

    return runs
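The _is_old_scrape helper used above is not part of this listing. A minimal sketch of what such a check might look like, assuming old-style scrape() methods are recognizable by an explicit chamber parameter:

import inspect


def _is_old_scrape(scrape_method):
    # Sketch only: treat scrape() as old-style if it still takes a
    # 'chamber' argument, i.e. scrape(chamber, session_or_term).
    try:
        args = inspect.getargspec(scrape_method).args
    except TypeError:
        return False
    return 'chamber' in args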
Example #4
    def scrape_bill(self, bill, url):
        with self.urlopen(url) as page:
            # str.replace returns a new string; drop non-breaking spaces
            page = page.replace('&nbsp;', ' ')
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            bill.add_source(url)

            for link in page.xpath("//b[text()='Bill Text:']/"
                                   "following-sibling::blockquote/a"):
                bill.add_version(link.text, link.attrib['href'])

            more_sponsor_link = page.xpath("//a[text()='More Sponsors']")
            if more_sponsor_link:
                sponsor_url = more_sponsor_link[0].attrib['href']
                self.scrape_sponsors(bill, sponsor_url)
            else:
                for b in page.xpath("//td[text()='Sponsor(s):']/../td[2]/b"):
                    bill.add_sponsor("sponsor", b.text)

            for tr in page.xpath("""
            //b[text()='Detailed Status:']/
            following-sibling::blockquote[1]/table/tr""")[1:]:
                action = tr.xpath("string(td[3])").strip()

                match = re.search('(to|by) Governor on (.*)', action)
                if match:
                    date = parse_exec_date(match.group(2).strip()).date()
                    actor = 'executive'
                else:
                    if tr.attrib['bgcolor'] == 'Salmon':
                        actor = 'lower'
                    elif tr.attrib['bgcolor'] == 'LightGreen':
                        actor = 'upper'
                    else:
                        raise ScrapeError("Invalid row color: %s" %
                                          tr.attrib['bgcolor'])

                    date = tr.xpath("string(td[1])")
                    try:
                        date = re.search(
                            r"\d\d?/\d\d?/\d{4,4}", date).group(0)
                    except AttributeError:
                        # No date, skip
                        self.warning("skipping action '%s -- %s'" % (
                            date, action))
                        continue

                    date = datetime.datetime.strptime(date, "%m/%d/%Y")
                    date = date.date()

                bill.add_action(actor, action, date,
                                type=action_type(action))

                for vote_link in tr.xpath("td[4]/a"):
                    self.scrape_vote(bill, actor, vote_link.attrib['href'])

            self.save_bill(bill)
Example #5
    def scrape_votes_old(self, bill, billname, session):
        vote_url = ('http://archives.legislature.state.oh.us/bills.cfm?ID=' +
                    session + '_' + billname)

        page = self.get(vote_url).text
        page = lxml.html.fromstring(page)

        for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
            date = datetime.datetime.strptime(jlink.text,
                                              "%m/%d/%Y").date()

            details = jlink.xpath("string(../../../td[2])")

            chamber = details.split(" - ")[0]
            if chamber == 'House':
                chamber = 'lower'
            elif chamber == 'Senate':
                chamber = 'upper'
            else:
                raise ScrapeError("Bad chamber: %s" % chamber)

            motion = details.split(" - ")[1].split("\n")[0].strip()

            vote_row = jlink.xpath("../../..")[0].getnext()

            yea_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Yea')]")[0]
            yeas = []
            for td in yea_div.xpath("table/tr/td"):
                name = td.xpath("string()")
                if name:
                    yeas.append(name)

            no_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Nay')]")[0]
            nays = []
            for td in no_div.xpath("table/tr/td"):
                name = td.xpath("string()")
                if name:
                    nays.append(name)

            yes_count = len(yeas)
            no_count = len(nays)

            vote = Vote(chamber, date, motion, yes_count > no_count,
                        yes_count, no_count, 0)

            for yes in yeas:
                vote.yes(yes)
            for no in nays:
                vote.no(no)

            vote.add_source(vote_url)

            bill.add_vote(vote)
Example #6
    def scrape_events(self, chamber, session, event_id):
        url = '%s%s' % (self.upper_url, event_id)
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")
        # some event ids resolve to an empty page
        if len(rows):
            table_data = rows[0].find('table')[1]

            for link in table_data.iterchildren('td'):
                td = link.xpath('//td[@class="ms-formbody"]')

                description = td[18].text
                when = td[19].text
                where = td[25].text
                type = td[27].text
                meeting_lead = td[28].text

                when = datetime.datetime.strptime(when, "%m/%d/%Y  %H:%M %p")
                when = self._tz.localize(when)
                event_type = 'committee:meeting'
                kwargs = {"location": "State House"}
                if where is not None and where != "":
                    kwargs['location'] = where
                event = Event(session, when, event_type, description, **kwargs)

                if td[20].text is None:
                    # fall back to the meeting lead as the sole participant
                    participants = [meeting_lead]
                else:
                    participants = td[20].text.split(';')
                if participants:
                    for participant in participants:
                        name = participant.strip().replace('HON.', '', 1)
                        if name != "":
                            event.add_participant('committee',
                                                  name,
                                                  'committee',
                                                  chamber=chamber)

                event.add_source(url)
                self.save_event(event)
        else:
            # Hack: event ids are not contiguous, so don't fail on gaps among
            # the low numbers; only stop once we are past ids that could
            # plausibly be in use.
            if event_id > 1700:
                raise ScrapeError(
                    "Parsing is done; we are on future ids that are not used yet."
                )
Example #7
    def scrape(self, chamber, session):
        # check for abiword
        if os.system('which abiword') != 0:
            raise ScrapeError('abiword is required for PR scraping')

        year = session[0:4]
        self.base_url = 'http://www.oslpr.org/legislatura/tl%s/tl_medida_print2.asp' % year
        chamber_letter = {'lower': 'C', 'upper': 'S'}[chamber]
        for code, type in self.bill_types.iteritems():
            counter = itertools.count(1)
            for n in counter:
                bill_id = '%s%s%s' % (code, chamber_letter, n)
                try:
                    self.scrape_bill(chamber, session, bill_id, type)
                except NoSuchBill:
                    break
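The loop above probes sequential bill ids until the scraper raises NoSuchBill. A self-contained toy version of that probing pattern (fetch() and the three-bill cutoff are made up):

import itertools


class NoSuchBill(Exception):
    pass


def fetch(bill_id):
    # stand-in for scrape_bill(): pretend only three bills exist
    if int(bill_id[2:]) > 3:
        raise NoSuchBill(bill_id)
    return bill_id


found = []
for n in itertools.count(1):
    try:
        found.append(fetch('PC%s' % n))
    except NoSuchBill:
        break
print(found)  # ['PC1', 'PC2', 'PC3']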
Example #8
def parse_exec_date(date_str):
    """
    Parse dates for executive actions.
    """
    match = re.search('(\w+ \d{1,2}, \d{4,4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%B %d, %Y")

    match = re.search('(\d{1,2}/\d{1,2}/\d{4,4})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%Y")

    match = re.search('(\d{1,2}/\d{1,2}/\d{2,2})', date_str)
    if match:
        return datetime.datetime.strptime(match.group(1), "%m/%d/%y")

    raise ScrapeError("Invalid executive action date: %s" % date_str)
Example #9
    def recap(self, data):
        """Extract bill ids from daily recap page.
        Splits page into sections, and returns list containing bill ids
        """
        # throw away everything before <body>
        start = data.find("<body>")
        stop = data.find("</body>", start)

        bill_id_exp = re.compile(r">(?P<id>\w\. \d{1,4}?)</a> \(<a href=")
        billids = set()
        if start >= 0 and stop > start:
            # each bill has its own section, delimited by links to the
            # bill-handling CGI script
            parts = re.compile("/cgi-bin/web_bh10.exe").split(data[start:stop])
            for part in parts[1:]:
                result = bill_id_exp.search(part)
                if result:
                    bill_id = result.group('id')
                    billids.add(bill_id)

            return billids

        raise ScrapeError("recap: bad format %s" % data)
Example #10
    def parse_bill_xml(self, chamber, session, txt):
        root = lxml.etree.fromstring(txt)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        if session[2] == 'R':
            session = session[0:2]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                continue

            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'other'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = 'bill:passed'
            elif desc.startswith('Referred to'):
                atype = 'committee:referred'
            elif desc == "Filed":
                atype = 'bill:filed'
            else:
                atype = 'other'

            bill.add_action(actor, action.findtext('description'),
                            act_date, type=atype, **extra)

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsor('author', author)
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsor('coauthor', coauthor)
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsor('sponsor', sponsor)
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsor('cosponsor', cosponsor)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
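For reference, a rough sketch of the XML shape this parser expects; the element names are inferred from the lookups above and every value is invented:

SAMPLE_BILL_XML = """\
<billhistory bill="82(R) HB 1">
  <caption>Relating to an example matter.</caption>
  <actions>
    <action>
      <actionNumber>H001</actionNumber>
      <date>01/15/2011</date>
      <description>Filed</description>
      <comment/>
    </action>
  </actions>
  <authors>Smith | Jones</authors>
  <coauthors></coauthors>
  <sponsors></sponsors>
  <cosponsors></cosponsors>
  <subjects>
    <subject>Example subject</subject>
  </subjects>
</billhistory>
"""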
Example #11
def main():

    parser = argparse.ArgumentParser(
        description='Scrape data for state, saving data to disk.',
        parents=[base_arg_parser],
    )

    parser.add_argument('state', type=str,
                        help='state scraper module (eg. nc)')
    parser.add_argument('-s', '--session', action='append', dest='sessions',
                        help='session(s) to scrape')
    parser.add_argument('-t', '--term', action='append', dest='terms',
                        help='term(s) to scrape')
    parser.add_argument('--upper', action='store_true', dest='upper',
                        default=False, help='scrape upper chamber')
    parser.add_argument('--lower', action='store_true', dest='lower',
                        default=False, help='scrape lower chamber')
    parser.add_argument('--bills', action='store_true', dest='bills',
                        default=False, help="scrape bill data")
    parser.add_argument('--legislators', action='store_true',
                        dest='legislators', default=False,
                        help="scrape legislator data")
    parser.add_argument('--committees', action='store_true', dest='committees',
                        default=False, help="scrape committee data")
    parser.add_argument('--votes', action='store_true', dest='votes',
                        default=False, help="scrape vote data")
    parser.add_argument('--events', action='store_true', dest='events',
                        default=False, help='scrape event data')
    parser.add_argument('--alldata', action='store_true', dest='alldata',
                        default=False,
                        help="scrape all available types of data")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        default=False, help="fail immediately when "
                        "encountering validation warning")
    parser.add_argument('-n', '--no_cache', action='store_true',
                        dest='no_cache', help="don't use web page cache")
    parser.add_argument('--fastmode', help="scrape in fast mode",
                        action="store_true", default=False)
    parser.add_argument('-r', '--rpm', action='store', type=int, dest='rpm',
                        default=60)
    parser.add_argument('--timeout', action='store', type=int, dest='timeout',
                        default=10)

    args = parser.parse_args()

    settings.update(args)

    # set up search path
    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                    '../../openstates'))

    # get metadata
    metadata = __import__(args.state, fromlist=['metadata']).metadata
    state = metadata['abbreviation']

    configure_logging(args.verbose, args.state)

    # make output dir
    args.output_dir = os.path.join(settings.BILLY_DATA_DIR, args.state)
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise e

    # write metadata
    try:
        schema_path = os.path.join(os.path.split(__file__)[0],
                                   '../schemas/metadata.json')
        schema = json.load(open(schema_path))

        validator = DatetimeValidator()
        validator.validate(metadata, schema)
    except ValueError as e:
        logging.getLogger('billy').warning('metadata validation error: '
                                                 + str(e))

    with open(os.path.join(args.output_dir, 'state_metadata.json'), 'w') as f:
        json.dump(metadata, f, cls=JSONDateEncoder)

    # determine time period to run for
    args.sessions = list(args.sessions or [])
    if args.terms:
        for term in metadata['terms']:
            if term['name'] in args.terms:
                args.sessions.extend(term['sessions'])
    args.sessions = set(args.sessions)

    # determine chambers
    args.chambers = []
    if args.upper:
        args.chambers.append('upper')
    if args.lower:
        args.chambers.append('lower')
    if not args.chambers:
        args.chambers = ['upper', 'lower']

    if not (args.bills or args.legislators or args.votes or
            args.committees or args.events or args.alldata):
        raise ScrapeError("Must specify at least one of --bills, "
                          "--legislators, --committees, --votes, --events, "
                          "--alldata")

    if args.alldata:
        args.bills = True
        args.legislators = True
        args.votes = True
        args.committees = True

    if args.bills:
        _run_scraper(args.state, state, 'bills', args, metadata)
    if args.legislators:
        _run_scraper(args.state, state, 'legislators', args, metadata)
    if args.committees:
        _run_scraper(args.state, state, 'committees', args, metadata)
    if args.votes:
        _run_scraper(args.state, state, 'votes', args, metadata)
    if args.events:
        _run_scraper(args.state, state, 'events', args, metadata)
Example #12
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).content
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if (bill_title is None or "Bill does not exist" in history_xml):
            self.warning("Bill does not appear to exist")
            return
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        bill.add_source(history_url)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version(name=self.NAME_SLUGS[version[1][-5]],
                             url=version[1],
                             mimetype='text/html')

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document(name="Analysis ({})".format(
                self.NAME_SLUGS[analysis[1][-5]]),
                              url=analysis[1],
                              mimetype='text/html')

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document(name="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
                              url=fiscal_note[1],
                              mimetype='text/html')

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document(name="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
                              url=witness[1],
                              mimetype='text/html')

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            actor = {
                'H': 'lower',
                'S': 'upper',
                'E': 'executive'
            }[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            introduced = False

            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'bill:filed'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Vetoed by the Governor':
                atype = 'governor:vetoed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['bill:passed']
                if not introduced:
                    introduced = True
                    atype.append('bill:introduced')
            elif desc == "Passed as amended":
                atype = 'bill:passed'
            elif (desc.startswith('Referred to')
                  or desc.startswith("Recommended to be sent to ")):
                atype = 'committee:referred'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee:passed'
            elif desc == "Filed":
                atype = 'bill:filed'
            elif desc == 'Read 3rd time':
                atype = 'bill:reading:3'
            elif desc == 'Read 2nd time':
                atype = 'bill:reading:2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee:passed:favorable'
            else:
                atype = 'other'

            if 'committee:referred' in atype:
                repls = ['Referred to', "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                extra['committees'] = ctty

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsor('primary', author, official_type='author')
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsor('cosponsor',
                                 coauthor,
                                 official_type='coauthor')
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsor('primary', sponsor, official_type='sponsor')
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsor('cosponsor',
                                 cosponsor,
                                 official_type='cosponsor')

        self.save_bill(bill)
Example #13
    def scrape_bill(self, bill, url):
        page = self.urlopen(url)
        # str.replace returns a new string; drop non-breaking spaces
        page = page.replace('&nbsp;', ' ')
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        bill.add_source(url)

        for link in page.xpath("//b[text()='Bill Text:']/"
                               "following-sibling::blockquote[1]//a"):
            if link.attrib['href'].endswith('pdf'):
                mimetype = 'application/pdf'
            elif link.attrib['href'].endswith('htm'):
                mimetype = 'text/html'
            else:
                # unknown format; don't reuse a stale mimetype from a prior link
                mimetype = None
            bill.add_version(link.text, link.attrib['href'], mimetype=mimetype)

        more_sponsor_link = page.xpath("//a[text()='More Sponsors']")
        if more_sponsor_link:
            sponsor_url = more_sponsor_link[0].attrib['href']
            self.scrape_sponsors(bill, sponsor_url)
        else:
            for b in page.xpath("//td[text()='Sponsor(s):']/../td[2]/b"):
                bill.add_sponsor("primary", b.text)

        for tr in page.xpath("""
        //b[text()='Detailed Status:']/
        following-sibling::blockquote[1]/table/tr""")[1:]:
            action = tr.xpath("string(td[3])").strip()

            match = re.search('(to|by) Governor on (.*)', action)
            if match:
                date = parse_exec_date(match.group(2).strip()).date()
                actor = 'executive'
            else:
                if tr.attrib['bgcolor'] == 'Salmon':
                    actor = 'lower'
                elif tr.attrib['bgcolor'] == 'LightGreen':
                    actor = 'upper'
                else:
                    raise ScrapeError("Invalid row color: %s" %
                                      tr.attrib['bgcolor'])

                date = tr.xpath("string(td[1])")
                try:
                    date = re.search(r"\d\d?/\d\d?/\d{4,4}", date).group(0)
                except AttributeError:
                    # No date, skip
                    self.warning("skipping action '%s -- %s'" % (date, action))
                    continue

                date = datetime.datetime.strptime(date, "%m/%d/%Y")
                date = date.date()

            types, attrs = self.categorizer.categorize(action)
            action = dict(actor=actor, action=action, date=date, type=types)
            action.update(**attrs)
            bill.add_action(**action)

            for vote_link in tr.xpath("td[4]/a"):
                self.scrape_vote(bill, actor, vote_link.attrib['href'])

        # If nearly all of the bill attributes but the title are blank, this is a bad bill.
        # See Issue #166.
        if all(
                len(bill[x]) == 0
                for x in ('votes', 'alternate_titles', 'sponsors', 'actions',
                          'versions', 'documents')):
            return False

        self.save_bill(bill)
        return True
Example #14
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke ipdb when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing # to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
Example #15
    def scrape(self, chamber, session):
        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            norm_chamber_name = 'Senate'
        else:
            url_chamber_name = 'house'
            norm_chamber_name = 'House'

        assembly_url = '/assembly/%s' % session

        chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)

        list_url = self.site_root + assembly_url + chamber_url

        # Parsing
        with self.urlopen(list_url) as data:
            soup = self.parser.parse(data)

            if not soup:
                raise ScrapeError('Failed to parse legislative list page.')

            table = soup.find('table', summary=norm_chamber_name + ' Bills')

            bill_links = table.findAll('a', href=re.compile('bill-actions'))
            indexed_bills = {}

            for link in bill_links:
                # Populate base attributes
                attributes = {
                    'session': session,
                    'chamber': chamber,
                }

                bill_number = link.contents[0]

                if not re.match('^[0-9]{4}$', bill_number):
                    raise ScrapeError('Bill number not in expected format.')

                # ND bill prefixes are coded numerically
                if bill_number[0] == '1':
                    bill_prefix = 'HB'
                elif bill_number[0] == '2':
                    bill_prefix = 'SB'
                elif bill_number[0] == '3':
                    bill_prefix = 'HCR'
                elif bill_number[0] == '4':
                    bill_prefix = 'SCR'
                elif bill_number[0] == '5':
                    bill_prefix = 'HR'
                elif bill_number[0] == '6':
                    bill_prefix = 'SR'
                elif bill_number[0] == '7':
                    bill_prefix = 'HMR'
                elif bill_number[0] == '8':
                    bill_prefix = 'SMR'
                else:
                    raise ScrapeError('Unknown bill number prefix: %s' %
                                      bill_number)

                attributes['bill_id'] = bill_prefix + ' ' + bill_number

                # Skip duplicates (bill is listed once for each version)
                if attributes['bill_id'] in indexed_bills:
                    continue

                self.debug(attributes['bill_id'])

                # Parse details page
                attributes.update(
                    self.scrape_bill_details(assembly_url, bill_number))

                # Create bill
                bill = Bill(**attributes)

                # Parse actions
                (actions, actions_url) = self.scrape_bill_actions(
                    assembly_url, bill_number, session)
                bill.add_source(actions_url)

                for action in actions:
                    bill.add_action(**action)

                # Parse versions
                (versions, versions_url) = self.scrape_bill_versions(
                    assembly_url, bill_number)
                bill.add_source(versions_url)

                for version in versions:
                    bill.add_version(**version)

                # Add bill to dictionary, indexed by its id
                indexed_bills[attributes['bill_id']] = bill

            # Parse sponsorship data

            (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url)

            for bill_id, sponsor_list in sponsors.items():
                for sponsor in sponsor_list:
                    # It's possible a bill was misnamed somewhere... but that's
                    # not a good enough reason to error out
                    if bill_id in indexed_bills:
                        bill = indexed_bills[bill_id]
                        bill.add_sponsor(**sponsor)
                        bill.add_source(sponsors_url)

            # Save bill
            for bill in indexed_bills.values():
                self.save_bill(bill)
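The numeric prefix translation in the loop above could also be table-driven. A compact alternative sketch with the same mapping; ScrapeError is assumed to come from the surrounding module:

ND_BILL_PREFIXES = {
    '1': 'HB', '2': 'SB', '3': 'HCR', '4': 'SCR',
    '5': 'HR', '6': 'SR', '7': 'HMR', '8': 'SMR',
}


def bill_prefix_for(bill_number):
    # Map the leading digit of an ND bill number to its chamber prefix.
    try:
        return ND_BILL_PREFIXES[bill_number[0]]
    except KeyError:
        raise ScrapeError('Unknown bill number prefix: %s' % bill_number)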
Example #16
    def get_page(self, url):
        try:
            with self.urlopen(url) as html:
                page = lxml.html.fromstring(html)
                return (page, html)
        except Exception:
            # surface fetch/parse failures as a ScrapeError
            raise ScrapeError("Error getting the page. Sorry, man.")
Example #17
    def scrape(self, chamber, year):
        """
        Scrape the ND legislators seated in a given chamber during a given year.
        """
        # Error checking
        if year not in metadata['session_details']:
            raise NoDataForPeriod(year)

        # No legislator data for 1997 (though other data is available)
        if year == '1997':
            raise NoDataForPeriod(year)

        # URL building
        if chamber == 'upper':
            url_chamber_name = 'senate'
            norm_chamber_name = 'Senate'
            url_member_name = 'senators'
        else:
            url_chamber_name = 'house'
            norm_chamber_name = 'House'
            url_member_name = 'representatives'

        assembly_url = '/assembly/%i-%s/%s' % (metadata['session_details'][str(
            year)]['number'], year, url_chamber_name)

        list_url = \
            self.site_root + \
            assembly_url + \
            '/members/last-name.html'

        # Parsing
        with self.urlopen(list_url) as data:
            soup = self.parser.parse(data)

            if not soup:
                raise ScrapeError('Failed to parse legislative list page.')

            header = soup.find('h2')

            if not header:
                raise ScrapeError('Legislative list header element not found.')

            party_images = {
                '/images/donkey.gif': 'Democrat',
                '/images/elephant.gif': 'Republican'
            }
            for row in header.findNextSibling('table').findAll('tr'):
                cells = row.findAll('td')
                party = party_images[cells[0].img['src']]
                name = map(lambda x: x.strip(),
                           cells[1].a.contents[0].split(', '))
                name.reverse()
                name = ' '.join(name)
                district = re.findall('District (\d+)',
                                      cells[2].contents[0])[0]
                attributes = {
                    'session': year,
                    'chamber': chamber,
                    'district': district,
                    'party': party,
                    'full_name': name,
                }
                split_name = name.split(' ')
                if len(split_name) > 2:
                    attributes['first_name'] = split_name[0]
                    attributes['middle_name'] = split_name[1].strip(' .')
                    attributes['last_name'] = split_name[2]
                else:
                    attributes['first_name'] = split_name[0]
                    attributes['middle_name'] = u''
                    attributes['last_name'] = split_name[1]

                # we can get some more data..
                bio_url = self.site_root + cells[1].a['href']
                try:
                    attributes.update(self.scrape_legislator_bio(bio_url))
                except urllib2.HTTPError:
                    self.log("failed to fetch %s" % bio_url)

                self.debug("attributes: %d", len(attributes))
                self.debug(attributes)
                # Save
                legislator = Legislator(**attributes)
                legislator.add_source(bio_url)
                self.save_legislator(legislator)
Example #18
    def parse_bill_xml(self, chamber, session, txt):
        root = lxml.etree.fromstring(txt.bytes)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        if session[2] == 'R':
            session = session[0:2]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        versions = root.xpath("//versions")
        for version in versions:
            versionz = version.xpath(".//version")
            for v in versionz:
                description = v.xpath(".//versionDescription")[0].text
                html_url = v.xpath(".//WebHTMLURL")[0].text
                bill.add_version(description, html_url, 'text/html')

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            extra = {}
            extra['action_number'] = action.find('actionNumber').text
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            actor = {
                'H': 'lower',
                'S': 'upper',
                'E': 'executive'
            }[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                continue

            introduced = False

            if desc == 'Amended':
                atype = 'amendment:passed'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment:introduced'
            elif desc == 'Amendment amended':
                atype = 'amendment:amended'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment:withdrawn'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'bill:passed'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'bill:introduced'
                else:
                    atype = 'bill:filed'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'governor:received'
            elif desc.startswith('Signed by the Governor'):
                atype = 'governor:signed'
            elif desc == 'Vetoed by the Governor':
                atype = 'governor:vetoed'
            elif desc == 'Read first time':
                atype = ['bill:introduced', 'bill:reading:1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['bill:passed']
                if not introduced:
                    introduced = True
                    atype.append('bill:introduced')
            elif desc == "Passed as amended":
                atype = 'bill:passed'
            elif desc.startswith('Referred to') or desc.startswith(
                    "Recommended to be sent to "):
                atype = 'committee:referred'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee:passed'
            elif desc == "Filed":
                atype = 'bill:filed'
            elif desc == 'Read 3rd time':
                atype = 'bill:reading:3'
            elif desc == 'Read 2nd time':
                atype = 'bill:reading:2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee:passed:favorable'
            else:
                atype = 'other'

            if 'committee:referred' in atype:
                repls = ['Referred to', "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                extra['committee'] = ctty

            bill.add_action(actor,
                            action.findtext('description'),
                            act_date,
                            type=atype,
                            **extra)

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsor('author', author)
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsor('coauthor', coauthor)
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsor('sponsor', sponsor)
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsor('cosponsor', cosponsor)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
Example #19
    def scrape_details(self, bill_detail_url, session, chamber, bill_id, page):

        data = page

        pat1 = re.compile(r'</FORM>')
        results = pat1.search(data)
        if not results:
            raise ScrapeError("scrape_details(1) - unable to parse |%s|" %
                              bill_detail_url)

        pre_start = page.find("<pre>", results.start())
        if pre_start == -1:
            self.warning(
                "scrape_details(2) - unable to parse (no <pre>) |%s|\n|%s|" %
                (bill_detail_url, page))
            return

        pre_stop = page.find("</pre>", pre_start)
        if pre_stop == -1:
            raise ScrapeError(
                "scrape_details(3) - unable to parse (no </pre>) %s" %
                bill_detail_url)

        pre_section = page[pre_start:pre_stop]

        data = pre_section
        vurl = None

        action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)')

        pat2 = re.compile(r' By ')
        results = pat2.search(data)
        if results is not None:
            bystuff = data[results.start():results.end()]
            data = data[results.end():]

        pat3 = re.compile(r'</b>')
        results1 = pat3.search(data)

        newspon = []
        if results is not None and results1 is not None:
            spondata = data[:results1.start()]
            mysponsors = sponsorsToList(spondata)
            for s in mysponsors:
                newspon.append(s)
            data = data[results1.end():]

        apat = re.compile(">(H|S) (\d*)<")
        billpat = re.compile("(\d+)")
        bill_number = billpat.search(bill_id).group(0)

        (similar_bills, summary, after_summary,
         vurl) = self.split_page_into_parts(data, session, bill_number)

        bill_summary = summary.strip().decode('utf8', 'ignore')

        bill = Bill(session,
                    chamber,
                    bill_id,
                    bill_summary,
                    type=bill_type(bill_summary))

        linenum = 0
        for line in after_summary.splitlines():
            # drop everything from the first parenthesis onward
            action_line = line.partition("(")[0].strip()
            r = action_line_re.search(action_line)

            if r:
                the_date = r.group(1)
                action_chamber = r.group(2)
                action = r.group(3)

                date = datetime.datetime.strptime(the_date, "%m/%d/%y")
                date = date.date()

                t = action_type(action)
                if t == ['other']:
                    self.debug("OTHERACTION: bill %s %d Text[%s] line[%s]" %
                               (bill_id, linenum, action, line))
                else:
                    self.debug("ACTION: %s %d dt|ch|action [%s|%s|%s] [%s]" %
                               (bill_id, linenum, the_date, action_chamber,
                                action, str(t)))

                bill.add_action(chamber, action, date, t)
            elif len(line) > 0:
                self.debug("Skipping line %d [%s] line:[%s]" %
                           (linenum, bill_id, line))

            linenum += 1

        if similar_bills:
            bill['similar'] = similar_bills

        bill.add_source(bill_detail_url)

        for sponsor in newspon:
            bill.add_sponsor("sponsor", sponsor)

        if vurl:
            try:
                self.scrape_vote_history(vurl, chamber, bill, bill_id)
                bill.add_source(vurl)
                self.debug("Scraped votes: (chamber=%s,bill=%s,url=%s)" %
                           (chamber, bill_id, vurl))
            except Exception as error:
                self.warning(
                    "Failed to scrape votes: chamber=%s bill=%s vurl=%s %s" %
                    (chamber, bill_id, vurl, traceback.format_exc()))

        self.save_bill(bill)
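A quick check of the action-line pattern used above, run on an invented history line:

import re

action_line_re = re.compile(r'(\d\d/\d\d/\d\d)\s+(\w+)\s+(.+)')
m = action_line_re.search("04/12/11  Senate  Read second time")
print(m.groups())  # ('04/12/11', 'Senate', 'Read second time')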
Example #20
    def scrape_bill(self, bill_url, chamber, session):
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])")
            summary = summary.replace('Summary: ', '')

            match = re.match(
                r"^([^:]+): "
                r"((\(Constitutional [aA]mendment\) )?[^(]+)", summary)

            if match:
                subjects = [match.group(1).strip()]
                title = match.group(2).strip()
            else:
                raise ScrapeError("Bad title")

            if bill_id.startswith('SB') or bill_id.startswith('HB'):
                bill_type = ['bill']
            elif bill_id.startswith('SR') or bill_id.startswith('HR'):
                bill_type = ['resolution']
            elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
                bill_type = ['concurrent resolution']
            else:
                raise ScrapeError("Invalid bill ID format: %s" % bill_id)

            if title.startswith("(Constitutional Amendment)"):
                bill_type.append('constitutional amendment')
                title = title.replace('(Constitutional Amendment) ', '')

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        subjects=subjects,
                        type=bill_type)
            bill.add_source(bill_url)

            history_link = page.xpath("//a[text() = 'History']")[0]
            history_url = history_link.attrib['href']
            self.scrape_history(bill, history_url)

            authors_link = page.xpath("//a[text() = 'Authors']")[0]
            authors_url = authors_link.attrib['href']
            self.scrape_authors(bill, authors_url)

            try:
                versions_link = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                versions_url = versions_link.attrib['href']
                self.scrape_versions(bill, versions_url)
                for doc in ["Notes", "Digest", "Amendments", "Misc"]:
                    doc_link = page.xpath("//a[text() = '%s']" % doc)[0]
                    doc_url = doc_link.attrib['href']
                    self.scrape_documents(bill, doc_url)
            except IndexError:
                # Only current version
                try:
                    version_link = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    version_url = version_link.attrib['href']
                    bill.add_version("%s Current" % bill_id,
                                     version_url,
                                     on_duplicate="use_old")
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            try:
                votes_link = page.xpath("//a[text() = 'Votes']")[0]
                self.scrape_votes(bill, votes_link.attrib['href'])
            except IndexError:
                # Some bills don't have any votes
                pass

            self.save_bill(bill)

            return True
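A quick check of the summary-splitting regex used above, on an invented summary line:

import re

summary = "TAX/SALES: Provides relative to an example exemption. (Item #1)"
match = re.match(r"^([^:]+): "
                 r"((\(Constitutional [aA]mendment\) )?[^(]+)", summary)
print(match.group(1).strip())  # TAX/SALES
print(match.group(2).strip())  # Provides relative to an example exemption.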
Example #21
    def scrape_vote(self, bill, date, url):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        else:
            raise ScrapeError("Bad chamber: %s" % chamber)

        committee = ' '.join(location.split(' ')[1:]).strip()
        if not committee or committee.startswith('of Representatives'):
            committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        if motion.startswith('Do Pass'):
            type = 'passage'
        elif motion == 'Concurred in amendments':
            type = 'amendment'
        elif motion == 'Veto override':
            type = 'veto_override'
        else:
            type = 'other'

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote['type'] = type

        if committee:
            vote['committee'] = committee

        vote.add_source(url)

        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text in ('Aye', 'Yea'):
                vote.yes(td.getprevious().text.strip())
            elif td.text == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif td.text in ('Excused', 'Absent'):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)