Esempio n. 1
0
    def scrape_bill(self, chamber, bill):
        """Scrape one bill's detail page, attach its votes and save it.

        ``bill`` is a mapping read for its 'id', 'url', 'session' and
        'chamber' keys.  ``chamber`` is only forwarded to
        ``self.scrape_votes``; the Bill and its actions use
        ``bill['chamber']``.

        Raises IndexError when the "Long Title", "Primary Sponsor" or
        "CoSponsors" rows are missing from the page.
        """
        bill_id = bill['id'].replace('w/', 'with ')

        page = lxml.html.fromstring(self.urlopen(bill['url']))
        page.make_links_absolute(bill['url'])

        title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
        # text_content() == make sure any tags in the title don't cause issues
        title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

        # now we can create a bill object
        b = Bill(bill['session'], bill['chamber'], bill_id, title)
        b.add_source(bill['url'])

        sponsors_row = page.xpath('//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
        sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text

        if sponsor is not None:
            b.add_sponsor('primary', sponsor)

        # scraping these and co-sponsors, but not doing anything with them until
        # it's decided whether or not to attempt to split 'em up
        additional = sponsors_row.xpath('td[@width="48%"]/font')
        # .text is None for an empty <font>, so fall back to '' before replace()
        additional_sponsors = (additional[0].text or '') if additional else ''
        additional_sponsors = additional_sponsors.replace('&nbsp&nbsp&nbsp', '')

        cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0]
        cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
        cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

        introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]')
        if introduced_row:
            # Relative path, like the other row lookups: a leading '/' would
            # anchor the query at the document root and match nothing.
            cells = introduced_row[0].xpath('td[@width="31%"]/font')
            if cells and cells[0].text:
                introduced = datetime.strptime(cells[0].text.strip(),
                                               '%b %d, %Y')
                b.add_action(bill['chamber'], 'introduced', introduced,
                             'bill:introduced')

        actions = page.xpath('//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font')
        if actions:
            for line in actions[0].text_content().split('\n'):
                line = line.strip()
                if not line:
                    # blank lines carry no "<date> - <action>" pair
                    continue
                date_text, _, action_text = line.partition(' - ')
                date = datetime.strptime(date_text.strip(), '%b %d, %Y')
                b.add_action(bill['chamber'], action_text, date)

        # resources = page.xpath('//tr[td/b[contains(font, "Full text of Legislation")]]')

        # collect the vote report links, then scrape each one
        vote_urls = []
        voting_reports = page.xpath('//tr[td/b[contains(font, "Voting Reports")]]')
        if voting_reports:
            vote_urls = [report.attrib['href']
                         for report in voting_reports[0].xpath('td/font/a')]

        for url in vote_urls:
            vote = self.scrape_votes(chamber, title, bill_id, url)
            b.add_vote(vote)

        # Save bill
        self.save_bill(b)
Esempio n. 2
0
    def scrape(self, chamber, session):
        """Save a single hard-coded bill with fixed sponsors, votes and actions.

        The bill id is 'SB 1' when ``chamber`` is 'upper' and 'HB 1'
        otherwise.  The session is validated first via
        ``self.validate_session``.
        """
        self.validate_session(session)

        is_upper = chamber == 'upper'
        bill_id = 'SB 1' if is_upper else 'HB 1'
        other_chamber = 'lower' if is_upper else 'upper'

        date_format = '%m/%d/%Y'
        first_day = datetime.datetime.strptime('1/29/2010', date_format)
        second_day = datetime.datetime.strptime('1/30/2010', date_format)

        bill = Bill(session, chamber, bill_id, 'A super bill')
        bill.add_source('http://example.com/')
        bill.add_version('As Introduced', 'http://example.com/SB1.html')
        bill.add_document('Google', 'http://google.com')
        for sponsor_type, sponsor_name in (('primary', 'Bob Smith'),
                                           ('secondary', 'Johnson, Sally')):
            bill.add_sponsor(sponsor_type, sponsor_name)

        # Upper-chamber vote: passed 2-0-0.
        upper_vote = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
        for voter in ('Smith', 'Johnson'):
            upper_vote.yes(voter)

        # Lower-chamber vote: failed 0-1-1.
        lower_vote = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
        lower_vote.no('Bob Smith')
        lower_vote.other('S. Johnson')

        for vote in (upper_vote, lower_vote):
            bill.add_vote(vote)

        for actor, text, when in ((chamber, 'introduced', first_day),
                                  (chamber, 'read first time', second_day),
                                  (other_chamber, 'introduced', second_day)):
            bill.add_action(actor, text, when)

        self.save_bill(bill)
Esempio n. 3
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page: title, sponsors, actions and lower votes.

        The page is fetched from ``self.base_url`` with ``bill_id`` as the
        ``r`` query parameter.  Every row of the last table on the page is
        handed to ``self.parse_action``; for 'bill:passed' actions that
        carry a link, the vote is scraped when the action text matches a
        lower-chamber pattern in ``_voteChambers``.

        Raises NoSuchBill when no title cell is found.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
            )
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(
                u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                aname = self.clean_name(aname).strip()
                # a trailing comma would otherwise yield an empty sponsor
                if aname:
                    bill.add_sponsor('primary', aname)
            co_authors = doc.xpath(
                u'//td/b[contains(text(),"Co-autor")]/../text()')
            # The names are read from the second text node, so require it to
            # exist instead of only checking for a non-empty result.
            if len(co_authors) > 1:
                for co_author in co_authors[1].split(','):
                    co_author = self.clean_name(co_author).strip()
                    if co_author:
                        bill.add_sponsor('cosponsor', co_author)
            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')
                # ignore row missing date
                if len(tds) != 2:
                    continue
                date = datetime.datetime.strptime(
                    tds[0].text_content().strip(), "%m/%d/%Y")
                action = tds[1].text_content().strip()
                #parse the text to see if it's a new version or a unrelated document
                #if has - let's *shrug* assume it's a vote document

                #get url of action
                action_url = tds[1].xpath('a/@href')
                atype, action = self.parse_action(chamber, bill, action,
                                                  action_url, date)
                if atype == 'bill:passed' and action_url:
                    # Only assign on a real match; binding the loop variable
                    # directly would leave the last pattern's chamber behind
                    # when nothing matches.
                    vote_chamber = None
                    for pattern, candidate in _voteChambers:
                        if re.match(pattern, action):
                            vote_chamber = candidate
                            break
                    else:
                        self.warning('coudnt find voteChamber pattern')

                    if vote_chamber == 'lower':
                        vote = self.scrape_votes(action_url[0], action, date,
                                                 vote_chamber)
                        if vote[0] is not None:
                            vote[0].add_source(action_url[0])
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Esempio n. 4
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page: title, sponsors, actions and lower votes.

        The page is fetched from ``self.base_url`` with ``bill_id`` as the
        ``r`` query parameter.  Pages containing the ASP error marker are
        skipped with a warning.  A row with an empty date cell inherits the
        date of the previous row.

        Raises NoSuchBill when no title cell is found.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.urlopen(url)
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)
        co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
        # The names are read from the second text node, so require it to
        # exist instead of only checking for a non-empty result.
        if len(co_authors) > 1:
            for co_author in co_authors[1].split(','):
                co_author = self.clean_name(co_author).strip()
                if co_author:
                    bill.add_sponsor('cosponsor', co_author)
        action_table = doc.xpath('//table')[-1]
        # Most recent date seen; rows with an empty date cell reuse it.
        date = None
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            date_text = tds[0].text_content().strip()
            if date_text:
                date = datetime.datetime.strptime(date_text, "%m/%d/%Y")
            elif date is None:
                # no date on this row and none to inherit from earlier rows
                continue
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has - let's *shrug* assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == 'bill:passed' and action_url:
                # Only assign on a real match; binding the loop variable
                # directly would leave the last pattern's chamber behind
                # when nothing matches.
                vote_chamber = None
                for pattern, candidate in _voteChambers:
                    if re.match(pattern, action):
                        vote_chamber = candidate
                        break
                else:
                    self.warning('coudnt find voteChamber pattern')

                if vote_chamber == 'lower':
                    vote = self.scrape_votes(action_url[0], action, date,
                                             vote_chamber)
                    if vote[0] is not None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
Esempio n. 5
0
    def scrape_current(self, chamber, term):
        """Scrape every current bill whose history touches ``chamber``.

        Loads the JSON listing at ``ksapi.url + "bill_status/"`` and, for
        each entry with at least one HISTORY event in this chamber, builds
        and saves a Bill with its titles, document/version link, sponsors,
        actions and any votes parsed from the event status text.
        """
        chamber_name = "Senate" if chamber == "upper" else "House"
        # TODO: the listing covers both chambers; consider caching it so one
        # request can serve both scrapes.
        with self.urlopen(ksapi.url + "bill_status/") as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:
                # Filter out bills with no activity in this chamber.  This is
                # evaluated afresh for every bill so a match on an earlier
                # bill cannot leak into later ones.
                in_chamber = any(history["chamber"] == chamber_name
                                 for history in bill_data["HISTORY"])
                if not in_chamber:
                    continue

                bill_no = bill_data["BILLNO"]
                bill = Bill(term, chamber, bill_no, bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_no.lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                # A lone sponsor is the primary; several are all cosponsors.
                sponsor_names = bill_data["SPONSOR_NAMES"]
                sponsor_type = ("primary" if len(sponsor_names) == 1
                                else "cosponsor")
                for sponsor in sponsor_names:
                    bill.add_sponsor(sponsor_type, sponsor)

                for event in bill_data["HISTORY"]:
                    # Presence is tested on the event, so the names are read
                    # from the event as well.
                    # NOTE(review): assumes both values are lists of names --
                    # confirm against the API payload.
                    has_committees = "committee_names" in event
                    has_conferees = "conferee_names" in event
                    if has_committees and has_conferees:
                        actor = " and ".join(event["committee_names"] +
                                             event["conferee_names"])
                    elif has_committees:
                        actor = " and ".join(event["committee_names"])
                    elif has_conferees:
                        actor = " and ".join(event["conferee_names"])
                    else:
                        # Events name their chamber "Senate"/"House"; the
                        # ``chamber`` argument is 'upper'/'lower' and never
                        # equals "Senate".
                        actor = ("upper" if event["chamber"] == "Senate"
                                 else "lower")

                    date = datetime.datetime.strptime(
                        event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        votes = votes_re.match(event["status"])
                        if votes:
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(ksapi.ksleg + "bill_status/" +
                                            bill_no.lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Esempio n. 6
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page: title, sponsors, actions and lower votes.

        The page is fetched from ``self.base_url`` with ``bill_id`` as the
        ``r`` query parameter.  Every row of the last table on the page is
        handed to ``self.parse_action``; for "bill:passed" actions that
        carry a link, the vote is scraped when the action text matches a
        lower-chamber pattern in ``_voteChambers``.

        Raises NoSuchBill when no title cell is found.
        """
        url = "%s?r=%s" % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(","):
                aname = self.clean_name(aname).strip()
                # a trailing comma would otherwise yield an empty sponsor
                if aname:
                    bill.add_sponsor("primary", aname)
            co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
            # The names are read from the second text node, so require it to
            # exist instead of only checking for a non-empty result.
            if len(co_authors) > 1:
                for co_author in co_authors[1].split(","):
                    co_author = self.clean_name(co_author).strip()
                    if co_author:
                        bill.add_sponsor("cosponsor", co_author)
            action_table = doc.xpath("//table")[-1]
            for row in action_table[1:]:
                tds = row.xpath("td")
                # ignore row missing date
                if len(tds) != 2:
                    continue
                date = datetime.datetime.strptime(tds[0].text_content().strip(), "%m/%d/%Y")
                action = tds[1].text_content().strip()
                # parse the text to see if it's a new version or a unrelated document
                # if has - let's *shrug* assume it's a vote document

                # get url of action
                action_url = tds[1].xpath("a/@href")
                atype, action = self.parse_action(chamber, bill, action, action_url, date)
                if atype == "bill:passed" and action_url:
                    # Only assign on a real match; binding the loop variable
                    # directly would leave the last pattern's chamber behind
                    # when nothing matches.
                    vote_chamber = None
                    for pattern, candidate in _voteChambers:
                        if re.match(pattern, action):
                            vote_chamber = candidate
                            break
                    else:
                        self.warning("coudnt find voteChamber pattern")

                    if vote_chamber == "lower":
                        vote = self.scrape_votes(action_url[0], action, date, vote_chamber)
                        if vote[0] is not None:
                            vote[0].add_source(action_url[0])
                            bill.add_vote(vote[0])
                        else:
                            self.warning("Problem Reading vote: %s,%s" % (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Esempio n. 7
0
    def scrape(self, chamber, session):
        """Scrape and save every bill listed for ``chamber`` in ``session``.

        Walks the bill directory for the session slug, then for each bill
        link collects sponsors, title, PDF versions, history actions, the
        votes linked from history rows, and proposed amendments.
        """
        self.all_bills = {}
        self.slug = self.metadata['session_details'][session]['slug']

        page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
        page.make_links_absolute(self.base_url)

        ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
        header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]

        # Every ul with a data-load-action and an id
        bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                       " and boolean(@id)]/@data-load-action")

        bill_anchors = []

        for bill_list_url in bill_list_pages:
            bill_list_page = self.lxmlize('{}{}'.format(self.base_url, bill_list_url))
            bill_list_page.make_links_absolute(self.base_url)
            bill_anchors.extend(bill_list_page.xpath('//a') or [])

        ws = re.compile(r"\s+")
        # Group 1 is the action text before the vote tally.
        vote_cleaning_re = re.compile(r'(.*?)((Ayes)|(Nays),\s.*)')
        olis_root = "https://olis.leg.state.or.us/liz/"

        def _clean_ws(txt):
            """Remove extra whitespace from text."""
            return ws.sub(' ', txt).strip()

        for a in bill_anchors:
            bid = ws.sub('', a.text_content())  # bill id
            # an anchor without a title attribute yields an empty summary
            bill_summary = _clean_ws(a.get('title') or '')
            # bill title is added below
            bill = Bill(session, chamber, bid, title='', summary=bill_summary)
            bill_page = self.lxmlize(a.get('href'))
            versions = bill_page.xpath('//ul[@class="dropdown-menu"]/li/span/' +
                                       'a[contains(@title, "Get the Pdf")]/@href')

            # Map each overview label (colon removed) to its value element.
            measure_info = {}
            info = bill_page.xpath("//table[@id='measureOverviewTable']/tr")
            for row in info:
                key, value = row.xpath("./*")
                key = key.text.replace(':', '').strip()
                measure_info[key] = value

            for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
                if sponsor.text_content().strip():
                    bill.add_sponsor(
                            type='primary', name=sponsor.text_content())

            for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
                if sponsor.text_content().strip():
                    bill.add_sponsor(
                            type='cosponsor', name=sponsor.text_content())

            title = _clean_ws(measure_info['Bill Title'].text_content())
            # some bill titles need to be added manually
            if self.slug == "2013R1" and bid == "HB2010":
                # trailing space keeps "contested" and "case" apart
                title = ("Relating to Water Resources Department contested "
                         "case proceedings.")
            bill['title'] = title

            for version in versions:
                name = version.split("/")[-1]
                bill.add_version(name=name, url=version,
                                 mimetype='application/pdf')

            history_url = self.create_url('Measures/Overview/GetHistory/{bill}', bid)
            history = self.lxmlize(history_url).xpath("//table/tr")
            for entry in history:
                wwhere, action = [_clean_ws(x.text_content())
                                  for x in entry.xpath("*")]
                # Drop the trailing vote tally from the action text.
                tally = vote_cleaning_re.match(action)
                if tally:
                    action = tally.groups()[0]
                wwhere = re.match(
                    r"(?P<when>.*) \((?P<where>.*)\)", wwhere).groupdict()

                action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
                # History rows give month-day only; the year is taken from
                # the first four characters of the session slug.
                when = "%s-%s" % (self.slug[:4], wwhere['when'])
                when = dt.datetime.strptime(when, "%Y-%m-%d")

                types = []
                for expr, types_ in self.action_classifiers:
                    m = re.match(expr, action)
                    if m:
                        types += types_

                if types == []:
                    types = ['other']

                # actor, action, date, type, committees, legislators
                bill.add_action(action_chamber, action, when, type=types)

                # Parse and store Vote information
                vote_href = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
                if not vote_href:
                    continue
                vote_id = vote_href[0].split("-")[-1]
                if "#measureVotes-" in vote_href[0]:
                    vote_url = olis_root + \
                            "{0}/Measures/MeasureVotes?id={1}". \
                            format(self.slug, vote_id)
                else:
                    vote_url = olis_root + \
                            "{0}/CommitteeReports/MajorityReport/{1}". \
                            format(self.slug, vote_id)

                votes = self._get_votes(vote_url)
                if not any(len(x) for x in votes.values()):
                    self.warning("The votes webpage was empty for " +
                            "action {0} on bill {1}.".format(action, bid))
                    continue

                yes_count = len(votes["yes_votes"])
                no_count = len(votes["no_votes"])
                # Same outcome as yes/(yes+no) > 0.5, without dividing by
                # zero when only "other" votes were recorded.
                passed = yes_count > no_count

                # NOTE(review): the vote's chamber is the chamber being
                # scraped and bill_chamber is the action's chamber -- confirm
                # this is not swapped.
                vote = Vote(
                        chamber=chamber,
                        date=when,
                        motion=action,
                        passed=passed,
                        yes_count=yes_count,
                        no_count=no_count,
                        other_count=len(votes["other_votes"]),

                        session=session,
                        bill_id=bid,
                        bill_chamber=action_chamber
                        )

                vote.update(votes)
                bill_url = olis_root + \
                        "{0}/Measures/Overview/{1}".format(self.slug, bid)
                vote.add_source(bill_url)

                bill.add_vote(vote)

            amendments_url = self.create_url(
                    'Measures/ProposedAmendments/{bill}', bid)
            amendments = self.lxmlize(amendments_url).xpath(
                    "//div[@id='amendments']/table//tr")

            for amendment in amendments:
                nodes = amendment.xpath("./td")

                # rows without <td> cells carry no amendment
                if nodes == []:
                    continue

                pdf_href, date, committee, adopted, when = nodes
                pdf_href, = pdf_href.xpath("./a")
                pdf_link = pdf_href.attrib['href']

                name = "Ammendment %s" % (pdf_href.text_content())

                adopted = adopted.text
                bill.add_document(name=name, url=pdf_link,
                                  adopted=adopted,
                                  mimetype='application/pdf')

            bill.add_source(a.get('href'))
            self.save_bill(bill)
Esempio n. 8
0
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary
    and assembly floor votes from the assembly page.

    The bill is built while the instance is constructed.  Afterwards
    ``self.bill`` holds the result and ``self.succeeded`` tells whether
    the page contained any data to scrape.
    '''

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        '''Store the inputs, create the Bill and scrape it right away.

        ``bill_id_parts`` is unpacked into (letter, number, version).
        ``scraper`` must provide ``url2lxml`` and ``categorizer``.
        '''
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text fetched by _get_chunks().
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

    def _build(self):
        '''Run every scraping step, unless the page has no <pre> text.

        ``self.succeeded`` stays False when the page is empty.
        '''
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        '''Return the (summary, actions) text, fetching it only once.

        Raises ValueError when the page does not contain exactly two
        <pre> text nodes.
        '''
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y') % self.bill_id
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre/text()')
            self.data['summary'], self.data['actions'] = summary, actions
        return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        '''Fetch ``url`` through the scraper, recording it as a bill source.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        '''Add the printable HTML text of the bill as its version.'''
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=' + self.bill_id
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        '''Add each bill named in the summary's "SAME AS" chunk as a companion.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS    ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        # Strip the "Same as "/"Uni" prefixes and a trailing
                        # "-<suffix>" from the companion id.
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        '''Link the sponsor's memorandum; only done for lower-chamber bills.'''
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=&Memo=Y') % self.bill_id
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        '''Use the last blank-line-separated chunk as the bill summary.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        '''Strip titles and role markers from a legislator name.

        Matching is case-insensitive.  Parentheses, commas and spaces
        left at either end are trimmed off.
        '''
        junk = [
            r'^Rules\s+',
            r'\(2nd Vice Chairperson\)',
            r'\(MS\)',
            r'Assemblyman',
            r'Assemblywoman',
            r'Senator']
        for rgx in junk:
            # flags must be passed by keyword: the fourth positional
            # argument of re.sub is ``count``, not ``flags``.
            name = re.sub(rgx, '', name, flags=re.I)
        return name.strip('(), ')

    def get_sponsors(self):
        '''Add sponsors from the SPONSOR/COSPNSR/MLTSPNSR summary chunks.

        'SPONSOR' entries are added as primary, the other two kinds as
        cosponsor; the page's own label is kept as ``official_type``.
        '''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        spons_swap = {'SPONSOR': 'primary'}
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if not chunk.startswith(sponsor_type):
                    continue
                # partition() tolerates a label with nothing after it.
                _, _, data = chunk.partition(' ')
                for sponsor in re.split(r',\s+', data.strip()):

                    if not sponsor:
                        continue

                    # If it's a "Rules" bill, add the Rules committee
                    # as the primary.
                    if sponsor.startswith('Rules'):
                        self.bill.add_sponsor('primary', 'Rules Committee',
                                              chamber='lower')

                    sponsor = self._scrub_name(sponsor)

                    # Figure out sponsor type.
                    _sponsor_type = spons_swap.get(sponsor_type, 'cosponsor')

                    self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                          official_type=sponsor_type)

    def get_actions(self):
        '''Add every dated line of the actions text as a bill action.

        An action written entirely in upper case is attributed to the
        upper chamber, anything else to the lower chamber.
        '''
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        '''Add one 'Floor Vote' per table on the Assembly votes page.'''
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=&Votes=Y')
        doc = self.url2lxml(url % self.bill_id)
        if doc is None:
            return

        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        for table in doc.xpath('//table'):
            # Raw non-yes/no vote values for this table only, so votes do
            # not share (and keep growing) one dict.
            actual_vote = collections.defaultdict(list)

            # The value is the text of the element following the label.
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Cell texts alternate: name, vote value, name, vote value...
            cells = iter(table.xpath('tr/td/text()'))
            while True:
                pair = list(islice(cells, 2))
                if len(pair) != 2:
                    # End of data. Stop.
                    break
                name, vote_val = pair
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Esempio n. 9
0
    def _scrape_bill(self, session, bill_data):
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         (prefix, number, active_version)) = details

        bill = Bill(session,
                    bill_chamber,
                    bill_id,
                    title,
                    type=bill_type,
                    summary=bill_data['summary'])

        if bill_data['title'] is None:
            bill['title'] = bill_data['summary']

        bill_active_version = bill_data['amendments']['items'][active_version]

        # Parse sponsors.
        if bill_data['sponsor']['rules'] == True:
            bill.add_sponsor('primary',
                             'Rules Committee',
                             chamber=bill_chamber)
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsor('primary', primary_sponsor['shortName'])

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version['coSponsors']['items']
            for cosponsor in cosponsors:
                bill.add_sponsor('cosponsor', cosponsor['shortName'])

        # List companion bill.
        same_as = bill_active_version.get('sameAs', {})
        # Check whether "sameAs" property is populated with at least one bill.
        if same_as['items']:
            # Get companion bill ID.
            companion_bill_id = same_as['items'][0]['basePrintNo']

            # Build companion bill session.
            start_year = same_as['items'][0]['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Determine companion bill chamber.
            companion_bill_prefix = self._parse_bill_number(
                same_as['items'][0]['basePrintNo'])[0]
            companion_bill_chamber = self._parse_bill_prefix(
                companion_bill_prefix)[0]

            # Attach companion bill data.
            bill.add_companion(
                companion_bill_id,
                companion_bill_session,
                companion_bill_chamber,
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_datetime = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d')
            action_date = action_datetime.date()
            types, attrs = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(chamber,
                            action['text'],
                            action_date,
                            type=types,
                            **attrs)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                vote = self._parse_senate_votes(vote_data)
                bill.add_vote(vote)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()
            assembly_bill_data = assembly.bill

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        amendments = bill_data['amendments']['items']
        for key, amendment in amendments.iteritems():
            version = amendment['printNo']

            html_version = version + ' HTML'
            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version(html_version, html_version, mimetype='text/html')

            pdf_version = version + ' PDF'
            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version(pdf_version,
                             pdf_version,
                             mimetype='application/pdf')

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        bill.add_source(self.api_client.root + self.api_client.\
            resources['bill'].format(
                session_year=session,
                bill_id=bill_id,
                summary='',
                detail=''))
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        return bill
Esempio n. 10
0
    def bill_info(self, bill_link, session, main_url, bill_page):
        """Parse a Nebraska Legislature bill page and save it as a ``Bill``.

        Arguments:
            bill_link: URL of the bill page; added as a source to the bill
                and to every vote found on it.
            session: session identifier handed to ``Bill``.
            main_url: URL added as the bill's first source.
            bill_page: HTML text of the bill page.

        Returns ``None`` in every case.  If the page has no ``<h2>`` under
        ``div#content_text`` the method returns early and nothing is saved;
        otherwise the finished bill is handed to ``self.save_bill``.
        """
        bill_page = lxml.html.fromstring(bill_page)

        #basic info
        try:
            long_title = bill_page.xpath('//div[@id="content_text"]/h2')[0].text.split()
        except IndexError:
            return None
        bill_id = long_title[0]
        # The second heading token is skipped (presumably a separator such
        # as "-" -- confirm against a live page); the rest is the title.
        title = ' '.join(long_title[2:])

        #bill_type
        bill_type = 'resolution' if 'LR' in bill_id else 'bill'

        # The chamber is hard-coded to 'upper' for every bill.
        bill = Bill(session, 'upper', bill_id, title, type = bill_type)

        #sources
        bill.add_source(main_url)
        bill.add_source(bill_link)

        #Sponsor
        introduced_by = bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
        bill.add_sponsor('primary', introduced_by)

        #actions
        for actions in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
            date = actions[0].text
            # The header row carries the text "Date" instead of a date.
            if 'Date' in date:
                continue
            date = datetime.strptime(date, '%b %d, %Y')
            action = actions[1].text

            if '-' in action:
                # The last token is expected to be a "yes-no-abstain" tally,
                # but hyphens also occur in ordinary action text.  Only
                # record a vote when the token really is three integers
                # instead of crashing the whole bill on int().
                vote_info = action.split()[-1].split('-')
                try:
                    yes_count, no_count, abstention_count = [
                        int(count) for count in vote_info[:3]]
                except ValueError:
                    # Fewer than three parts, or a part that is not a number.
                    pass
                else:
                    passed = yes_count > no_count
                    vote = Vote('upper', date, action, passed, yes_count, no_count, abstention_count)
                    vote.add_source(bill_link)
                    bill.add_vote(vote)

            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = 'upper'

            action_type = self.action_types(action)
            bill.add_action(actor, action, date, action_type)

        # For versions, documents and amendments the first three characters
        # of each href are dropped before prefixing the site root
        # (presumably a leading "../" -- TODO confirm).

        #versions
        for versions in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
            version_url = 'http://nebraskalegislature.gov/' + versions.attrib['href'][3:]
            version_name = versions.text
            bill.add_version(version_name, version_url)

        #documents
        #additional_info -- only links whose URL contains ".pdf" are kept
        for additional_info in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
            document_name = additional_info.text
            document_url = 'http://nebraskalegislature.gov/' + additional_info.attrib['href'][3:]
            if '.pdf' in document_url:
                bill.add_document(document_name, document_url)

        #amendments
        for admendments in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
            admendment_name = admendments.text
            admendment_url = 'http://nebraskalegislature.gov/' + admendments.attrib['href'][3:]
            bill.add_document(admendment_name, admendment_url)

        #related transcripts (hrefs are used exactly as found on the page)
        for transcripts in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
            transcript_name = transcripts.text
            transcript_url = transcripts.attrib['href']
            bill.add_document(transcript_name, transcript_url)

        self.save_bill(bill)
Esempio n. 11
0
    def scrape_bills(self, session, year_abr):
        """Assemble and save every bill found in the NJ legislature data dumps.

        Reads the ``MainBill``, ``BillSpon``, ``BillWP``, ``BillHist`` and
        ``BillSubj`` tables through ``self.access_to_csv`` plus the vote
        archives under ``ftp://www.njleg.state.nj.us/votes/``, attaches
        sponsors, documents/versions, votes, actions and subjects to the
        matching bill, and finally saves each bill that has at least one
        action or version.

        session -- session identifier; stored on each ``Bill`` as ``str``.
        year_abr -- year used in document URLs and, together with the
            following year, in the vote archive names.

        Raises ``Exception`` when a document row has a doctype that is not in
        ``self._doctypes``.
        """
        #Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['IdenticalBillNumber'].strip():
                bill.add_companion(rec['IdenticalBillNumber'].split()[0])

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            # 'P' marks a primary sponsor; every other type is a cosponsor.
            sponsor_type = "primary" if rec["Type"] == 'P' else "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["Sponsor"])

        #Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # Keep only the last directory and the file name of the
            # backslash-separated path stored in the table.
            path_parts = rec["Document"].split('\\')
            document = path_parts[-2] + "/" + path_parts[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            if rec['DocType'] not in self._version_types:
                bill.add_document(doc_name, htm_url)
                continue

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = htm_url[:-1]

            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            else:
                # Previously this fell through with `mimetype` either unbound
                # or left over from an earlier record; skip instead of
                # attaching a version with a wrong type.
                self.warning('unknown mimetype for version %s on %s' %
                             (htm_url, bill_id))
                continue
            bill.add_version(doc_name, htm_url, mimetype=mimetype)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        # Per the NJ leg's glossary, a veto override requires
        # 2/3ds of each chamber. 27 in the senate, 54 in the house.
        # http://www.njleg.state.nj.us/legislativepub/glossary.asp
        override_thresholds = {'lower': 54, 'upper': 27}

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue

            # The archive name alone decides the chamber and whether the
            # records are committee ("C..." archives) or chamber votes.
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            # Start every archive with an empty tally so that an archive
            # containing neither expected file cannot re-use (or fail on)
            # the tally of the previous archive.
            votes = {}
            zipedfile = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                    try:
                        vote_file = zipedfile.open(vfile, 'U')
                    except KeyError:
                        #
                        # Right, so, 2011 we have an "End" file with more
                        # vote data than was in the original dump.
                        #
                        self.warning("No such file: %s" % (vfile))
                        continue

                    vdict_file = csv.DictReader(vote_file)

                    # NOTE(review): the tally restarts for each file that
                    # opens, so when both files exist only the votes of the
                    # last one are kept -- confirm this is intended.
                    votes = {}

                    for rec in vdict_file:

                        if vote_file_type == 'chamber':
                            bill_id = rec["Bill"].strip()
                            leg = rec["Full_Name"]

                            date = rec["Session_Date"]
                            action = rec["Action"]
                            leg_vote = rec["Legislator_Vote"]
                        else:
                            bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                            leg = rec['Name']
                            # drop time portion
                            date = rec['Agenda_Date'].split()[0]
                            # make motion readable
                            action = self._com_vote_motions[rec['BillAction']]
                            # first char (Y/N) use [0:1] to ignore ''
                            leg_vote = rec['LegislatorVote'][0:1]

                        date = datetime.strptime(date, "%m/%d/%Y")
                        vote_id = '_'.join((bill_id, chamber, action))
                        vote_id = vote_id.replace(" ", "_")

                        if vote_id not in votes:
                            votes[vote_id] = Vote(chamber, date, action, None, None,
                                                  None, None, bill_id=bill_id)
                        if vote_file_type == 'committee':
                            votes[vote_id]['committee'] = self._committees[
                                rec['Committee_House']]

                        if leg_vote == "Y":
                            votes[vote_id].yes(leg)
                        elif leg_vote == "N":
                            votes[vote_id].no(leg)
                        else:
                            votes[vote_id].other(leg)
            finally:
                # Release the archive handle, then remove temp file -- also
                # when parsing fails part-way through.
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count

                if vote['motion'] == 'OVERRIDE':
                    # Veto override: needs the fixed per-chamber threshold;
                    # an unrecognised chamber never passes.
                    needed = override_thresholds.get(vote['chamber'])
                    vote['passed'] = (needed is not None
                                      and vote_yes_count >= needed)
                else:
                    # Regular vote.
                    vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                # Bills with a blank title were skipped above, so a vote may
                # refer to a bill that is not being tracked.
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        #Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            date = datetime.strptime(rec["DateAction"], "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(rec["Action"], bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill_dict[bill_id].setdefault('subjects', []).append(
                rec['SubjectKey'])

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # A bill with neither actions nor versions is reported, not saved.
            if not bill['actions'] and not bill['versions']:
                self.warning('probable phony bill detected %s',
                             bill['bill_id'])
                phony_bill_count += 1
            else:
                # add sources
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                self.save_bill(bill)

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Esempio n. 12
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        chamber -- chamber the bill is filed under; also the initial actor
            for its actions.
        session -- session string; interpolated into the bill URL, and its
            first four characters are used as the year of every action date.
        bill_id -- bill identifier; spaces are removed for the URL and for
            the subject lookup.
        short_title -- optional alternate title, added when it differs
            (ignoring case) from the title found on the page.

        The finished bill is passed to ``self.save_bill``; nothing is
        returned.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        # Tables are used by position: [0] sponsors, [1] title, [2] actions.
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

        if short_title and bill['title'].lower() != short_title.lower():
            bill.add_title(short_title)

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version(name, href, mimetype='application/pdf')
            else:
                bill.add_document(name, href)

        def _split(string):
            # Split a sponsor list on commas and on the standalone word
            # "AND".  The previous pattern, r"\w+[,|AND]\s+", treated
            # "[,|AND]" as a character class and swallowed the name in front
            # of every comma, so those sponsors were lost.  Empty pieces are
            # discarded by the caller.
            return re.split(r"\s*(?:,|\bAND\b)\s*", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        # Everything after the first 'by' holds sponsors; with no 'by' in the
        # text the slice is empty and no sponsor is added.
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsor('primary', sponsors.strip())
                continue
            for person in _split(sponsors):
                person = person.strip()
                if person != "":
                    bill.add_sponsor('primary', person)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # Rows without a date inherit the date of the last dated row.
            if date:
                last_date = date
            else:
                date = last_date

            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y")
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                vote = self.parse_vote(actor, date, row[2])
                vote.add_source(url)
                bill.add_vote(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(actor, action, date, type=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        self.save_bill(bill)
Esempio n. 13
0
    def scrape_bills(self, session, year_abr):
        """Assemble and save every bill found in the NJ legislature data dumps.

        Reads the ``MainBill``, ``BillSpon``, ``BillWP``, ``BillHist`` and
        ``BillSubj`` tables through ``self.access_to_csv`` plus the vote
        archives under ``ftp://www.njleg.state.nj.us/votes/``, attaches
        sponsors, documents/versions, votes, actions and subjects to the
        matching bill, and finally saves each bill that has at least one
        action or version.

        session -- session identifier; stored on each ``Bill`` as ``str``.
        year_abr -- year used in document URLs and, together with the
            following year, in the vote archive names.

        Raises ``Exception`` when a document row has a doctype that is not in
        ``self._doctypes``.
        """
        #Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['IdenticalBillNumber'].strip():
                bill.add_companion(rec['IdenticalBillNumber'].split()[0])

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            # 'P' marks a primary sponsor; every other type is a cosponsor.
            sponsor_type = "primary" if rec["Type"] == 'P' else "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["Sponsor"])

        #Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # Keep only the last directory and the file name of the
            # backslash-separated path stored in the table.
            path_parts = rec["Document"].split('\\')
            document = path_parts[-2] + "/" + path_parts[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = htm_url[:-1]

            if rec['DocType'] not in self._version_types:
                bill.add_document(doc_name, htm_url)
                continue

            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            else:
                # Previously this fell through with `mimetype` either unbound
                # or left over from an earlier record; skip instead of
                # attaching a version with a wrong type.
                self.warning('unknown mimetype for version %s on %s' %
                             (htm_url, bill_id))
                continue

            try:
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            except ValueError:
                # add_version rejected the version; report it and move on.
                self.warning("Couldn't find a document for bill {}".format(bill_id))

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        # Per the NJ leg's glossary, a veto override requires
        # 2/3ds of each chamber. 27 in the senate, 54 in the house.
        # http://www.njleg.state.nj.us/legislativepub/glossary.asp
        override_thresholds = {'lower': 54, 'upper': 27}

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue

            # The archive name alone decides the chamber and whether the
            # records are committee ("C..." archives) or chamber votes.
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            # Start every archive with an empty tally so that an archive
            # containing neither expected file cannot re-use (or fail on)
            # the tally of the previous archive.
            votes = {}
            zipedfile = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                    try:
                        vote_file = zipedfile.open(vfile, 'U')
                    except KeyError:
                        #
                        # Right, so, 2011 we have an "End" file with more
                        # vote data than was in the original dump.
                        #
                        self.warning("No such file: %s" % (vfile))
                        continue

                    vdict_file = csv.DictReader(vote_file)

                    # NOTE(review): the tally restarts for each file that
                    # opens, so when both files exist only the votes of the
                    # last one are kept -- confirm this is intended.
                    votes = {}

                    for rec in vdict_file:

                        if vote_file_type == 'chamber':
                            bill_id = rec["Bill"].strip()
                            leg = rec["Full_Name"]

                            date = rec["Session_Date"]
                            action = rec["Action"]
                            leg_vote = rec["Legislator_Vote"]
                        else:
                            bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                            leg = rec['Name']
                            # drop time portion
                            date = rec['Agenda_Date'].split()[0]
                            # make motion readable
                            action = self._com_vote_motions[rec['BillAction']]
                            # first char (Y/N) use [0:1] to ignore ''
                            leg_vote = rec['LegislatorVote'][0:1]

                        date = datetime.strptime(date, "%m/%d/%Y")
                        vote_id = '_'.join((bill_id, chamber, action))
                        vote_id = vote_id.replace(" ", "_")

                        if vote_id not in votes:
                            votes[vote_id] = Vote(chamber, date, action, None, None,
                                                  None, None, bill_id=bill_id)
                        if vote_file_type == 'committee':
                            votes[vote_id]['committee'] = self._committees[
                                rec['Committee_House']]

                        if leg_vote == "Y":
                            votes[vote_id].yes(leg)
                        elif leg_vote == "N":
                            votes[vote_id].no(leg)
                        else:
                            votes[vote_id].other(leg)
            finally:
                # Release the archive handle, then remove temp file -- also
                # when parsing fails part-way through.
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count

                if vote['motion'] == 'OVERRIDE':
                    # Veto override: needs the fixed per-chamber threshold;
                    # an unrecognised chamber never passes.
                    needed = override_thresholds.get(vote['chamber'])
                    vote['passed'] = (needed is not None
                                      and vote_yes_count >= needed)
                else:
                    # Regular vote.
                    vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                # Bills with a blank title were skipped above, so a vote may
                # refer to a bill that is not being tracked.
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        #Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            date = datetime.strptime(rec["DateAction"], "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(rec["Action"], bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill_dict[bill_id].setdefault('subjects', []).append(
                rec['SubjectKey'])

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # A bill with neither actions nor versions is reported, not saved.
            if not bill['actions'] and not bill['versions']:
                self.warning('probable phony bill detected %s',
                             bill['bill_id'])
                phony_bill_count += 1
            else:
                # add sources
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                self.save_bill(bill)

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Esempio n. 14
0
    def scrape(self, chamber, term):
        """Scrape every bill listed for ``term`` on the Nebraska Legislature site.

        Fetches the search-by-date index for the term's year, follows each
        bill link found there, builds a ``Bill`` (sponsor, actions, votes,
        versions, documents, amendments, transcripts) and hands it to
        ``self.save_bill``.

        :param chamber: chamber recorded on each bill and vote, and used as
            the actor for actions that mention neither the Governor nor the
            Speaker.
        :param term: term key; it must exist in the local ``years`` mapping,
            otherwise ``KeyError`` is raised.
        """
        years = {'102': 2011}
        site_root = 'http://nebraskalegislature.gov/'
        # Every XPath used below hangs off the same content container.
        content = ('/html/body/div[@id="wrapper"]/div[@id="content"]'
                   '/div[@id="content_text"]')
        main_url = site_root + 'bills/search_by_date.php?SessionDay=%s' % (
            years[term])

        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for docs in page.xpath(
                    content + '/div[@class="cal_content_full"]'
                    '/table[@id="bill_results"]/tr/td[1]/a'):
                bill_link = site_root + docs.attrib['href']
                with self.urlopen(bill_link) as bill_page:
                    bill_page = lxml.html.fromstring(bill_page)

                    # basic info: the first heading token is the bill id, the
                    # second token is skipped (presumably a separator), and
                    # the remaining tokens form the title.
                    long_title = bill_page.xpath(
                        content + '/h2')[0].text.split()
                    bill_id = long_title[0]
                    title = ' '.join(long_title[2:])

                    # bill_type
                    bill_type = 'resolution' if 'LR' in bill_id else 'bill'

                    bill = Bill(term, chamber, bill_id, title, type=bill_type)

                    # sources
                    bill.add_source(main_url)
                    bill.add_source(bill_link)

                    # Sponsor
                    introduced_by = bill_page.xpath(
                        content + '/div[2]/table/tr[2]/td[1]/a[1]')[0].text
                    bill.add_sponsor('primary', introduced_by)

                    # actions
                    for actions in bill_page.xpath(
                            content + '/div[3]/table/tr[1]/td[1]/table/tr'):
                        date = actions[0].text
                        if 'Date' in date:
                            # Header row of the actions table.
                            continue
                        date = datetime.strptime(date, '%b %d, %Y')
                        action = actions[1].text

                        if '-' in action:
                            # A vote is reported as a trailing
                            # "yes-no-abstain" token.  Other hyphenated text
                            # must not be fed to int(), so check the shape
                            # before treating the action as a vote.
                            counts = action.split()[-1].split('-')
                            if (len(counts) >= 3 and
                                    all(c.isdigit() for c in counts[:3])):
                                yes_count = int(counts[0])
                                no_count = int(counts[1])
                                abstention_count = int(counts[2])
                                passed = yes_count > no_count
                                vote = Vote(chamber, date, action, passed,
                                            yes_count, no_count,
                                            abstention_count)
                                vote.add_source(bill_link)
                                bill.add_vote(vote)

                        if 'Governor' in action:
                            actor = 'Governor'
                        elif 'Speaker' in action:
                            actor = 'Speaker'
                        else:
                            actor = chamber

                        action_type = self.action_types(action)
                        bill.add_action(actor, action, date, action_type)

                    # versions
                    for versions in bill_page.xpath(
                            content + '/div[2]/table/tr[2]/td[2]/a'):
                        # The first three characters of the href are dropped
                        # (presumably a "../" prefix) before it is joined to
                        # the site root.
                        version_url = site_root + versions.attrib['href'][3:]
                        bill.add_version(versions.text, version_url)

                    # documents (additional info): only PDF links are kept
                    for additional_info in bill_page.xpath(
                            content + '/div[2]/table/tr[2]/td/a'):
                        document_url = (
                            site_root + additional_info.attrib['href'][3:])
                        if '.pdf' in document_url:
                            bill.add_document(additional_info.text,
                                              document_url)

                    # amendments
                    for amendment in bill_page.xpath(
                            content +
                            '/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
                        amendment_url = (
                            site_root + amendment.attrib['href'][3:])
                        bill.add_document(amendment.text, amendment_url)

                    # related transcripts (href is used as found on the page)
                    for transcript in bill_page.xpath(
                            content + '/div[3]/table/tr[2]/td[2]/a'):
                        bill.add_document(transcript.text,
                                          transcript.attrib['href'])

                    self.save_bill(bill)
Esempio n. 15
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page and save the resulting ``Bill``.

        Reads the title, authors, co-authors and the action table from the
        page at ``self.base_url`` for ``bill_id``. Actions whose text ends
        in a vote tally are additionally recorded as ``Vote`` objects.

        :param chamber: chamber the bill belongs to; also the fallback
            chamber for votes whose text does not name one.
        :param session: session passed through to ``Bill``.
        :param bill_id: identifier used both in the query string and on the
            ``Bill``.
        :param bill_type: value passed as ``Bill(type=...)``.
        :raises NoSuchBill: if the page has no title cell.
        :raises AssertionError: if a vote line uses an unrecognised wording.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.get(url).text
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)
        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        # The names are read from the second text node, so that node has to
        # exist; a lone first node would otherwise raise IndexError.
        if len(co_authors) > 1:
            for co_author in co_authors[1].split(','):
                co_name = self.clean_name(co_author).strip()
                # Skip blanks (e.g. from a trailing comma), as is done for
                # the primary sponsors above.
                if co_name:
                    bill.add_sponsor('cosponsor', co_name)
        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            # A row with an empty date cell reuses the date parsed from an
            # earlier row.
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
            action = tds[1].text_content().strip()
            # parse the text to see if it's a new version or an unrelated
            # document; if it has a hyphen let's assume it's a vote document

            # get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)

            # Some lower-house roll calls could be parsed, but finicky.
            # Most roll lists are just images embedded within a document,
            # and offer no alt text to scrape.
            # Instead, just scrape the vote counts.
            vote_info = re.search(
                r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$',
                action)
            # The tally groups also accept pure whitespace, so additionally
            # require at least one digit somewhere in the action text.
            if vote_info and re.search(r'\d{1,2}', action):
                vote_name = vote_info.group(1)

                if u"Votación Final" in vote_name:
                    (vote_chamber,
                     vote_name) = re.search(r'(?u)^\w+ por (.*?) en (.*)$',
                                            vote_name).groups()
                    if "Senado" in vote_chamber:
                        vote_chamber = 'upper'
                    else:
                        vote_chamber = 'lower'

                elif "Cuerpo de Origen" in vote_name:
                    vote_name = re.search(r'(?u)^Cuerpo de Origen (.*)$',
                                          vote_name).group(1)
                    vote_chamber = chamber

                elif u"informe de Comisión de Conferencia" in vote_name:
                    (vote_chamber, vote_name) = re.search(
                        r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                        vote_name).groups()
                    if vote_chamber == "Senado":
                        vote_chamber = 'upper'
                    else:
                        vote_chamber = 'lower'

                elif u"Se reconsideró" in vote_name:
                    # A reconsideration is attributed to the chamber of the
                    # most recently recorded vote, if there is one.
                    if bill['votes']:
                        vote_chamber = bill['votes'][-1]['chamber']
                    else:
                        vote_chamber = chamber

                else:
                    raise AssertionError(
                        u"Unknown vote text found: {}".format(vote_name))

                vote_name = vote_name.title()

                yes = int(vote_info.group(2))
                no = int(vote_info.group(3))
                # The last two tally fields may be blank; both are folded
                # into a single "other" count.
                other = 0
                if vote_info.group(4).strip():
                    other += int(vote_info.group(4))
                if vote_info.group(5).strip():
                    other += int(vote_info.group(5))

                vote = Vote(chamber=vote_chamber,
                            date=date,
                            motion=vote_name,
                            passed=(yes > no),
                            yes_count=yes,
                            no_count=no,
                            other_count=other)
                vote.add_source(url)
                bill.add_vote(vote)

        bill.add_source(url)
        self.save_bill(bill)
Esempio n. 16
0
class SenateBillPage(object):
    '''Collect subjects, senate votes, the sponsor's memo and versions
    from an already-parsed senate bill page.

    Constructing an instance creates ``self.bill`` and immediately runs
    every ``get_*`` step against ``doc``; ``self.succeeded`` is True once
    all steps have completed.
    '''

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # bill_id_parts must unpack into exactly three values.
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

        self.bill.add_source(self.url)

    def _build(self):
        '''Run every scraping step, then flag the page as succeeded.'''
        self.get_senate_votes()
        self.get_sponsors_memo()
        self.get_subjects()
        self.get_versions()
        self.succeeded = True

    def url2lxml(self, url):
        '''Record ``url`` as a bill source and return the scraper's parsed
        document for it.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_subjects(self):
        '''Store the text of every "lawsection" link as the bill subjects.'''
        subjects = []
        for link in self.doc.xpath("//a[contains(@href, 'lawsection')]"):
            subjects.append(link.text.strip())

        self.bill['subjects'] = subjects

    def get_sponsors_memo(self):
        '''For upper-chamber bills, register this page as the sponsor's
        memorandum document.'''
        if self.chamber == 'upper':
            self.bill.add_document("Sponsor's Memorandum", self.url)

    def get_senate_votes(self):
        '''Add one Vote per floor-vote and committee-vote heading on the
        page.

        Raises ValueError when a tally heading is not one of Ayes, Nays,
        Excused, Abstain or Absent.
        '''
        other_headings = ('Excused', 'Abstain', 'Absent')

        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0
            actual_vote = collections.defaultdict(list)

            vtype = None
            other_label = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    # A bold tag is a heading carrying the category and its
                    # count in parentheses.
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith(other_headings):
                        vtype = 'other'
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                        # Remember the heading text before the count so the
                        # names that follow are filed under how they were
                        # actually recorded.
                        other_label = text.split('(')[0].strip()
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    # A link is a legislator listed under the last heading.
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append((name, other_label))

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, vote_val in other_votes:
                vote.other(name)
                actual_vote[vote_val].append(name)

            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
            # Heading fields are separated by tab(s) followed by a dash.
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith(other_headings):
                        vtype = 'other'
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append(name)

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, '%s Committee Vote' % committee,
                        passed, yes_count, no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name in other_votes:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)

    def get_versions(self):
        '''Add a version for every id listed after "Versions" on the page.'''
        # The last matching element is the innermost one containing the text.
        text = self.doc.xpath('//*[contains(., "Versions")]')[-1].text_content()
        version_text = re.sub(r'Versions:?\s*', '', text)

        url_tmpl = 'http://open.nysenate.gov/legislation/bill/'
        for version_bill_id in re.findall(r'\S+', version_text):
            # Only the trailing "-<year>" part is split off, so an id that
            # contains an earlier hyphen still unpacks into two parts.
            version_bill_id_noyear, _ = version_bill_id.rsplit('-', 1)
            version_url = url_tmpl + version_bill_id
            self.bill.add_version(version_bill_id_noyear, version_url,
                                  mimetype='text/html')
Esempio n. 17
0
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary
    and assembly floor votes from the assembly page.

    Constructing an instance creates ``self.bill`` and runs every scraping
    step; ``self.succeeded`` stays False when the page has no ``<pre>``
    text to work with.
    '''

    metadata = metadata('ny')

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.session = session
        self.term = term_for_session('ny', session)
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                # The start year belongs to the term that holds this
                # session; taking it from every term would always leave the
                # last listed term's year.
                self.term_start_year = data['start_year']
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # bill_id_parts must unpack into exactly three values.
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text fetched by _get_chunks().
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

    def _build(self):
        '''Run every scraping step unless the page has no <pre> text.'''
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        '''Return ``(summary, actions)`` text, fetching the page only once.

        The two texts come from the first two <pre> elements of the
        summary/actions page and are cached in ``self.data``.
        '''
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y&term=%s')
            url = url % (self.bill_id, self.term_start_year)
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre')[:2]
            summary = summary.text_content()
            actions = actions.text_content()
            self.data['summary'] = summary
            self.data['actions'] = actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        '''Record ``url`` as a bill source and return the scraper's parsed
        document for it.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        '''Add the printable bill text page as the bill's single version.'''
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s'
        url = url % (self.bill_id, self.term_start_year)
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        '''Add a companion for each bill listed in the "SAME AS" chunk.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS    ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        # Strip the "Same as " / "Uni" prefixes and a
                        # trailing "-<suffix>" from each id.
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        '''For lower-chamber bills, register the memo page as a document.'''
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=%s&Memo=Y')
            url = url % (self.bill_id, self.term_start_year)
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        '''Store the last blank-line-separated chunk as the bill summary.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        '''Return ``name`` without titles, role notes and stray punctuation.'''
        junk = [
            r'^Rules\s+',
            r'\(2nd Vice Chairperson\)',
            r'\(MS\)',
            r'Assemblyman',
            r'Assemblywoman',
            r'Senator']
        for rgx in junk:
            # re.I has to be passed as ``flags``; as the fourth positional
            # argument it would be read as ``count``.
            name = re.sub(rgx, '', name, flags=re.I)

        # Collapse whitespace.
        name = re.sub(r'\s+', ' ', name)
        return name.strip('(), ')

    def get_sponsors(self):
        '''Add sponsors from the SPONSOR / COSPNSR / MLTSPNSR chunks.'''
        # Only SPONSOR maps to "primary"; the other kinds are cosponsors.
        spons_swap = {'SPONSOR': 'primary'}

        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):

                        if not sponsor:
                            continue

                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary', 'Rules Committee',
                                                  chamber='lower')

                        sponsor = self._scrub_name(sponsor)

                        # Figure out sponsor type.
                        _sponsor_type = spons_swap.get(
                            sponsor_type, 'cosponsor')

                        self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                              official_type=sponsor_type)

    def get_actions(self):
        '''Add every dated line of the actions text as a bill action.'''
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            # All-uppercase action text is attributed to the upper chamber.
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        '''Add one floor Vote per table on the assembly votes page.'''

        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=%s&Votes=Y')
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return

        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        for table in doc.xpath('//table'):
            # One mapping per vote, so the "other" names of one table do
            # not end up on the votes built from the other tables.
            actual_vote = collections.defaultdict(list)

            # The value sits in the element right after its label.
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Cell texts alternate: name, vote value, name, vote value, ...
            tds = table.xpath('tr/td/text()')
            votes = iter(tds)
            while True:
                try:
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Esempio n. 18
0
    def scrape_bill_sheet(self, session, chamber):
        """Scrape every bill on a chamber's bill sheet and save each one.

        For each table row with a bill id, this builds a ``Bill`` with its
        history actions, sponsors and votes, then calls ``self.save_bill``.

        :param session: session passed to ``get_bill_folder``,
            ``get_vote_url`` and ``Bill``.
        :param chamber: ``"Senate"`` or ``"House"``; any other value raises
            ``KeyError``.
        :raises AssertionError: if the bill id found on the vote page does
            not match the id from the sheet.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions within a bill row.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id is None:
                    # Every other entry is null for some reason
                    continue

                # Keep the part before the first dot; an id without a dot
                # is kept whole.
                bill_id = bill_id.split(".", 1)[0]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                # Whatever follows the title is the "--"-separated sponsor
                # list.
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b = Bill(session, bill_chamber, bill_id, bill_title)

                for action in history:
                    self.add_action_to_bill(b, action)

                for sponsor in sponsors:
                    b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                if votes['sanity-check'] != bill_id:
                    print("XXX: READ ME!")
                    print(" -> Scraped ID: " + votes['sanity-check'])
                    print(" -> 'Real' ID:  " + bill_id)
                    # Raised explicitly so the check also runs under -O.
                    raise AssertionError(
                        "vote page bill id does not match sheet bill id")

                for vote in votes['votes']:
                    print(vote)
                    filed_votes = vote['votes']
                    passage = vote['meta']
                    result = vote['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    hasHouse = "House" in passage['x-parent-ctty']
                    hasSenate = "Senate" in passage['x-parent-ctty']

                    if hasHouse and hasSenate:
                        actor = "legislature"
                    elif hasHouse:
                        actor = "lower"
                    else:
                        actor = "upper"

                    # Convert each count before adding: adding first would
                    # join two strings ("1" + "2" -> 12) rather than sum
                    # them.
                    other_count = int(result['EXC']) + int(result['ABS'])

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "YES"),
                             int(result['YES']),
                             int(result['NO']),
                             other_count,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])
                    # XXX: Add more stuff to kwargs, we have a ton of data
                    for who, how in filed_votes.items():
                        if how.lower() == "yes":
                            v.yes(who)
                        elif how.lower() == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    v.add_source(bill_vote_href)
                    b.add_vote(v)
                self.save_bill(b)
Esempio n. 19
0
    def scrape(self, session, chambers):
        bill_type_map = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
        }

        chamber_map = {
            'H': 'lower',
            'S': 'upper',
            'J': 'joint',
            'E': 'other', # Effective date
        }

        action_code_map = {
            'HI': ['other'],
            'SI': ['other'],
            'HH': ['other'],
            'SH': ['other'],
            'HPF': ['bill:introduced'],
            'HDSAS': ['other'],
            'SPF': ['bill:introduced'],
            'HSR': ['bill:reading:2'],
            'SSR': ['bill:reading:2'],
            'HFR': ['bill:reading:1'],
            'SFR': ['bill:reading:1'],
            'HRECM': ['bill:withdrawn', 'committee:referred'],
            'SRECM': ['bill:withdrawn', 'committee:referred'],
            'SW&C': ['bill:withdrawn', 'committee:referred'],
            'HW&C': ['bill:withdrawn', 'committee:referred'],
            'HRA': ['bill:passed'],
            'SRA': ['bill:passed'],
            'HPA': ['bill:passed'],
            'HRECO': ['other'],
            'SPA': ['bill:passed'],
            'HTABL': ['other'],  # 'House Tabled' - what is this?
            'SDHAS': ['other'],
            'HCFR': ['committee:passed:favorable'],
            'SCFR': ['committee:passed:favorable'],
            'HRAR': ['committee:referred'],
            'SRAR': ['committee:referred'],
            'STR': ['bill:reading:3'],
            'SAHAS': ['other'],
            'SE': ['bill:passed'],
            'SR': ['committee:referred'],
            'HTRL': ['bill:reading:3', 'bill:failed'],
            'HTR': ['bill:reading:3'],
            'S3RLT': ['bill:reading:3', 'bill:failed'],
            'HASAS': ['other'],
            'S3RPP': ['other'],
            'STAB': ['other'],
            'SRECO': ['other'],
            'SAPPT': ['other'],
            'HCA': ['other'],
            'HNOM': ['other'],
            'HTT': ['other'],
            'STT': ['other'],
            'SRECP': ['other'],
            'SCRA': ['other'],
            'SNOM': ['other'],
            'S2R': ['bill:reading:2'],
            'H2R': ['bill:reading:2'],
            'SENG': ['bill:passed'],
            'HENG': ['bill:passed'],
            'HPOST': ['other'],
            'HCAP': ['other'],
            'SDSG': ['governor:signed'],
            'SSG': ['governor:received'],
            'Signed Gov': ['governor:signed'],
            'HDSG': ['governor:signed'],
            'HSG': ['governor:received'],
            'EFF': ['other'],
            'HRP': ['other'],
            'STH': ['other'],
            'HTS': ['other'],
        }

        sid = self.metadata['session_details'][session]['_guid']
        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']

        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]

            actions = reversed([{
                'code': x['Code'],
                'action': x['Description'],
                '_guid': x['Id'],
                'date': x['Date']
            } for x in history])

            guid = instrument['Id']

            # A little bit hacky.
            bill_prefix = instrument['DocumentType']
            bill_chamber = chamber_map[bill_prefix[0]]
            bill_type = bill_type_map[bill_prefix[1:]]

            bill_id = '%s %s' % (
                bill_prefix,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(session, bill_chamber, bill_id, title, type=bill_type,
                description=description, _guid=guid)

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = Vote(
                        {'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                        vote_['Date'],
                        vote_['Caption'] or 'Vote on Bill',
                        (vote_['Yeas'] > vote_['Nays']),
                        vote_['Yeas'],
                        vote_['Nays'],
                        (vote_['Excused'] + vote_['NotVoting']),
                        session=session,
                        bill_id=bill_id,
                        bill_chamber=bill_chamber)

                    vote.add_source(self.vsource)

                    methods = {'Yea': vote.yes, 'Nay': vote.no,}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        try:
                            m = methods[how]
                        except KeyError:
                            m = vote.other
                        m(whom['Name'])

                    bill.add_vote(vote)

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        'House': 'lower',
                        'Senate': 'upper',
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                action_chamber = chamber_map[action['code'][0]]

                try:
                    action_types = action_code_map[action['code']]
                except KeyError:
                    error_msg = ('Code {code} for action {action} not '
                        'recognized.'.format(
                            code=action['code'],
                            action=action['action']))

                    self.logger.warning(error_msg)

                    action_types = ['other']

                committees = []
                if any(('committee' in x for x in action_types)):
                    committees = [str(x) for x in ccommittees.get(
                        action_chamber, [])]

                bill.add_action(action_chamber, action['action'],
                    action['date'], action_types, committees=committees,
                    _code=action['code'], _code_id=action['_guid'])

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = '{First} {Last}'.format(**dict(sponsor['Name']))
                bill.add_sponsor(
                    'primary' if 'Author' in typ else 'seconday',
                     name
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                bill.add_version(
                    name,
                    url,
                    mimetype='application/pdf',
                    _internal_document_id=doc_id,
                    _version_id=version_id
                )

            versions = sorted(
                bill['versions'],
                key=lambda x: x['_internal_document_id']
            )
            bill['versions'] = versions

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                'session': session,
                'bid': guid,
            }))
            self.save_bill(bill)
Esempio n. 20
0
    def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
        """Scrape and save one bill from the 2008 session or earlier.

        Bills from 2008 and below are published inside a single ``pre``
        element, so it is simpler to parse the page as lines of text.

        :param chamber: chamber the bill is filed under; also the initial
            actor for its actions and the chamber recorded on its votes
        :param session: session name; its first four characters are used as
            the year when parsing action dates
        :param bill_id: bill identifier (spaces are removed to build the URL)
        :param short_title: accepted for call compatibility; not used here
        """
        url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
            session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            text = html.xpath('//pre')[0].text.split('\r\n')

            # title: only the all-upper-case segments of the second line
            title = " - ".join(
                [x.strip() for x in text[1].split('-') if x.isupper()])
            # bill type
            bill_type = get_bill_type(bill_id)

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)

            # sponsors: everything after the last 'by' on the first line,
            # comma separated; drop padding and empty entries
            for sponsor in text[0].split('by')[-1].split(','):
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsor('primary', sponsor)

            actor = chamber
            # clear vote-parsing state left over from the previous bill
            self.flag()
            self.vote = None
            self.in_vote = False

            for line in text:

                if re.match(r'^\d\d/\d\d', line):
                    # action line: "MM/DD <action text>"
                    date = datetime.datetime.strptime(
                        line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                    self.last_date = date
                    action_text = line[5:].strip()
                    # actor: the prefix test is case-insensitive, so the
                    # lookup key has to be normalised as well
                    if action_text.lower().startswith(('house', 'senate')):
                        actor = {'H': 'lower', 'S': 'upper'}[
                            action_text[0].upper()]

                    action = get_action(actor, action_text)
                    bill.add_action(actor, action_text, date, type=action)
                    if "bill:passed" in action or "bill:failed" in action:
                        passed = 'FAILED' not in action_text
                        votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                        if votes:
                            yes, no, other = votes.groups()
                            self.in_vote = True
                            # NOTE(review): the vote is filed under the
                            # bill's chamber, not the current actor - confirm
                            self.vote = Vote(chamber, date, action_text,
                                             passed, int(yes), int(no),
                                             int(other))
                else:
                    # a "Floor Sponsor" line ends the roll call in progress
                    if "Floor Sponsor" in line:
                        self.in_vote = False
                        if self.vote:
                            bill.add_vote(self.vote)
                            self.vote = None

                    # nothing to do if its not a vote
                    if not self.in_vote:
                        continue

                    # a heading switches which list the names belong to;
                    # lines without a heading continue the current list
                    if 'AYES --' in line:
                        self.flag(ayes=True)
                    elif 'NAYS --' in line:
                        self.flag(nays=True)
                    elif 'Absent and excused' in line:
                        self.flag(other=True)

                    if self.ayes:
                        for name in line.replace('AYES --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.yes(name)

                    if self.nays:
                        for name in line.replace('NAYS --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.no(name)

                    if self.other:
                        for name in line.replace('Absent and excused --',
                                                 '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.other(name)

            self.save_bill(bill)
Esempio n. 21
0
    def scrape_bills(self, chamber_to_scrape, session):
        """Scrape and save every bill of one chamber for a session.

        Reads the session's all-measures XML index, then fetches each bill's
        detail document for its titles, sponsors, versions, actions and
        votes.

        :param chamber_to_scrape: 'upper' or 'lower'; measures whose id does
            not map to this chamber are skipped
        :param session: session name used in the URLs; its first four
            characters are used as the year of each action date
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        with self.urlopen(url) as bill_dir_page:
            root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
            for mr in root.xpath('//lastaction/msrgroup'):
                bill_id = mr.xpath('string(measure)').replace(" ", "")
                chamber = "upper" if bill_id[0] == "S" else "lower"

                # just skip past bills that are of the wrong chamber
                if chamber != chamber_to_scrape:
                    continue

                # looked up only for bills we keep, so an unknown type letter
                # on a skipped measure cannot abort the run
                bill_type = {'B':'bill', 'C': 'concurrent resolution',
                             'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

                link = mr.xpath('string(actionlink)').replace("..", "")
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as details_page:
                    details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type, longtitle=longtitle)

                    def doc_path(tag):
                        # text of the first <tag> element with the leading
                        # relative-path prefix removed
                        return details_root.xpath(
                            'string(//%s)' % tag).replace("../../../../", "")

                    #sponsors
                    main_sponsor = details_root.xpath('string(//p_name)').split()
                    if main_sponsor:
                        main_sponsor = main_sponsor[0]
                        main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                        bill.add_sponsor("primary", main_sponsor,
                                         main_sponsor_url=main_sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        if leg:
                            leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                            bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

                    #Versions
                    # current and introduced texts are always recorded
                    bill.add_version(
                        "Current version",
                        "http://billstatus.ls.state.ms.us/" + doc_path('current_other'))
                    bill.add_version(
                        "As Introduced",
                        "http://billstatus.ls.state.ms.us/" + doc_path('intro_other'))

                    # the remaining texts only when their path points into
                    # a "documents" location
                    optional_versions = (
                        ('cmtesub_other', "Committee Substitute"),
                        ('passed_other', "As Passed the " + chamber),
                        ('asg_other', "Approved by the Governor"),
                    )
                    for tag, version_name in optional_versions:
                        version_path = doc_path(tag)
                        if "documents" in version_path:
                            bill.add_version(
                                version_name,
                                "http://billstatus.ls.state.ms.us/" + version_path)

                    # use committee names as scraped subjects; this does not
                    # depend on any action, so it is set once per bill
                    subjects = details_root.xpath('//h_name/text()')
                    subjects += details_root.xpath('//s_name/text()')
                    bill['subjects'] = subjects

                    # avoid duplicate votes
                    seen_votes = set()

                    #Actions
                    for action_el in details_root.xpath('//history/action'):
                        action_num = int(
                            action_el.xpath('string(act_number)').strip())
                        act_vote = action_el.xpath('string(act_vote)').replace("../../../..", "")
                        action_desc = action_el.xpath('string(act_desc)')
                        # description is "MM/DD <text>"; the year comes from
                        # the session name
                        date, action_desc = action_desc.split(" ", 1)
                        date = datetime.strptime(
                            date + "/" + session[0:4], "%m/%d/%Y")

                        if action_desc.startswith("(H)"):
                            actor = "lower"
                            action = action_desc[4:]
                        elif action_desc.startswith("(S)"):
                            actor = "upper"
                            action = action_desc[4:]
                        else:
                            actor = "executive"
                            action = action_desc

                        if "Veto" in action:
                            version_url = ("http://billstatus.ls.state.ms.us/"
                                           + doc_path('veto_other'))
                            bill.add_document("Veto", version_url)

                        atype = 'other'
                        for prefix, prefix_type in self._action_types:
                            if action.startswith(prefix):
                                atype = prefix_type
                                break

                        bill.add_action(actor, action, date, type=atype,
                                        action_num=action_num)

                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            if vote_url not in seen_votes:
                                seen_votes.add(vote_url)
                                vote = self.scrape_votes(vote_url, action,
                                                         date, actor)
                                vote.add_source(vote_url)
                                bill.add_vote(vote)

                    bill.add_source(bill_details_url)
                    self.save_bill(bill)
Esempio n. 22
0
    def bill_info(self, bill_link, session, chamber, main_url, bill_page):
        """Parse one bill page and save the resulting bill.

        :param bill_link: URL of the bill page; recorded as a source on the
            bill and on each vote
        :param session: session passed through to ``Bill``
        :param chamber: chamber passed to ``Bill``/``Vote``; also the
            default actor for actions
        :param main_url: listing URL, recorded as a source
        :param bill_page: raw HTML of the bill page
        :returns: always ``None``; returns early without saving when the
            page has no usable heading
        """
        bill_page = lxml.html.fromstring(bill_page)

        #basic info
        # the heading's first word is the bill id; words from the third
        # onward form the title
        try:
            long_title = bill_page.xpath('//div[@id="content_text"]/h2')[0].text.split()
            bill_id = long_title[0]
        except IndexError:
            # no heading, or a heading without any words
            return None
        title = ' '.join(long_title[2:])

        #bill_type
        bill_type = 'resolution' if 'LR' in bill_id else 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        #sources
        bill.add_source(main_url)
        bill.add_source(bill_link)

        #Sponsor
        introduced_by = bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
        bill.add_sponsor('primary', introduced_by)

        #actions
        for actions in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
            date = actions[0].text
            if 'Date' in date:
                # header row
                continue
            date = datetime.strptime(date, '%b %d, %Y')
            action = actions[1].text

            if '-' in action:
                # A tally is the last word of the action, "yes-no-abstain".
                # Hyphens also occur in ordinary action text, so anything
                # that does not parse as three integers is not a vote.
                vote_info = action.split()[-1].split('-')
                try:
                    yes_count, no_count, abstention_count = [
                        int(count) for count in vote_info[:3]]
                except ValueError:
                    pass
                else:
                    passed = yes_count > no_count
                    vote = Vote(chamber, date, action, passed, yes_count,
                                no_count, abstention_count)
                    vote.add_source(bill_link)
                    bill.add_vote(vote)

            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = chamber

            action_type = self.action_types(action)
            bill.add_action(actor, action, date, action_type)

        #versions
        # hrefs have their first three characters dropped before being
        # appended to the site root
        for versions in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
            version_url = 'http://nebraskalegislature.gov/' + versions.attrib['href'][3:]
            bill.add_version(versions.text, version_url)

        #documents
        #additional_info
        for additional_info in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
            document_name = additional_info.text
            document_url = 'http://nebraskalegislature.gov/' + additional_info.attrib['href'][3:]
            # only PDF links are kept as documents
            if '.pdf' in document_url:
                bill.add_document(document_name, document_url)

        #amendments
        for amendment in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
            amendment_url = 'http://nebraskalegislature.gov/' + amendment.attrib['href'][3:]
            bill.add_document(amendment.text, amendment_url)

        #related transcripts
        # transcript hrefs are stored as they appear in the page
        for transcripts in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
            bill.add_document(transcripts.text, transcripts.attrib['href'])

        self.save_bill(bill)
Esempio n. 23
0
    def scrape(self, session, chambers):
        """Scrape and save every piece of legislation for a session.

        Fetches the session's legislation index from the legislation
        service, then each instrument's detail, and records its votes,
        actions, sponsors and versions on a ``Bill``.

        :param session: session name; looked up in
            ``self.metadata['session_details']`` for the service's ``_guid``
        :param chambers: accepted but not consulted; every instrument the
            service returns is scraped
        """
        bill_chamber_map = {
            "H": "lower",
            "S": "upper",
            "J": "joint"
        }
        action_chamber_map = {
            "H": "lower",
            "S": "upper",
            "E": "other",  # Effective Date
        }
        branch_map = {
            "House": "lower",
            "Senate": "upper",
        }
        # action code -> action types; built once, not once per bill
        types = {
            "HI": ["other"],
            "SI": ["other"],
            "HH": ["other"],
            "SH": ["other"],
            "HPF": ["bill:introduced"],
            "HDSAS": ["other"],
            "SPF": ["bill:introduced"],
            "HSR": ["bill:reading:2"],
            "SSR": ["bill:reading:2"],
            "HFR": ["bill:reading:1"],
            "SFR": ["bill:reading:1"],
            "HRECM": ["bill:withdrawn", "committee:referred"],
            "SRECM": ["bill:withdrawn", "committee:referred"],
            "SW&C": ["bill:withdrawn", "committee:referred"],
            "HW&C": ["bill:withdrawn", "committee:referred"],
            "HRA": ["bill:passed"],
            "SRA": ["bill:passed"],
            "HPA": ["bill:passed"],
            "HRECO": ["other"],
            "SPA": ["bill:passed"],
            "HTABL": ["other"],  # "House Tabled" - what is this?
            "SDHAS": ["other"],
            "HCFR": ["committee:passed:favorable"],
            "SCFR": ["committee:passed:favorable"],
            "HRAR": ["committee:referred"],
            "SRAR": ["committee:referred"],
            "STR": ["bill:reading:3"],
            "SAHAS": ["other"],
            "SE": ["bill:passed"],
            "SR": ["committee:referred"],
            "HTRL": ["bill:reading:3", "bill:failed"],
            "HTR": ["bill:reading:3"],
            "S3RLT": ["bill:reading:3", "bill:failed"],
            "HASAS": ["other"],
            "S3RPP": ["other"],
            "STAB": ["other"],
            "SRECO": ["other"],
            "SAPPT": ["other"],
            "HCA": ["other"],
            "HNOM": ["other"],
            "HTT": ["other"],
            "STT": ["other"],
            "SRECP": ["other"],
            "SCRA": ["other"],
            "SNOM": ["other"],
            "S2R": ["bill:reading:2"],
            "H2R": ["bill:reading:2"],
            "SENG": ["bill:passed"],
            "HENG": ["bill:passed"],
            "HPOST": ["other"],
            "HCAP": ["other"],
            "SDSG": ["governor:signed"],
            "SSG": ["governor:received"],
            "Signed Gov": ["governor:signed"],
            "HDSG": ["governor:signed"],
            "HSG": ["governor:received"],
            "EFF": ["other"],
            "HRP": ["other"],
            "STH": ["other"],
            "HTS": ["other"],
        }

        sid = self.metadata['session_details'][session]['_guid']
        legislation = backoff(self.lservice.GetLegislationForSession,
                              sid)['LegislationIndex']
        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            actions = [{
                "code": x['Code'],
                "action": x['Description'],
                "_guid": x['Id'],
                "date": x['Date']
            } for x in instrument['StatusHistory'][0]]

            guid = instrument['Id']

            bill_type = instrument['DocumentType']
            # XXX: This is a bit of a hack - the first letter of the
            # document type selects the chamber.
            chamber = bill_chamber_map[bill_type[0]]

            bill_id = "%s %s" % (
                bill_type,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            # instruments without a caption are skipped entirely
            if title is None:
                continue

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        description=description,
                        _guid=guid)

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    # the listing only carries ids; fetch the full vote
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = Vote(branch_map[vote_['Branch']],
                                vote_['Date'],
                                vote_['Caption'] or "Vote on Bill",
                                (vote_['Yeas'] > vote_['Nays']),
                                vote_['Yeas'],
                                vote_['Nays'],
                                (vote_['Excused'] + vote_['NotVoting']),
                                session=session,
                                bill_id=bill_id,
                                bill_chamber=chamber)

                    vote.add_source(self.vsource)

                    methods = {
                        "Yea": vote.yes,
                        "Nay": vote.no,
                    }

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        # anything that is not a Yea or Nay counts as other
                        record = methods.get(how, vote.other)
                        record(whom['Name'])

                    bill.add_vote(vote)

            # committee names grouped by chamber
            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[branch_map[committee['Type']]].append(
                        committee['Name'])

            for action in actions:
                # kept separate from the bill's own chamber
                action_chamber = action_chamber_map[action['code'][0]]

                try:
                    _types = types[action['code']]
                except KeyError:
                    self.debug(action)
                    _types = ["other"]

                # committee-related actions carry the committees of the
                # acting chamber
                committees = []
                if any(('committee' in x for x in _types)):
                    committees = [
                        str(x) for x in ccommittees.get(action_chamber, [])]

                bill.add_action(action_chamber,
                                action['action'],
                                action['date'],
                                _types,
                                committees=committees,
                                _code=action['code'],
                                _code_id=action['_guid'])

            sponsors = []
            if instrument['Authors']:
                # copy, so the service response is not extended in place
                sponsors = list(instrument['Authors']['Sponsorship'])
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors.extend(instrument['Sponsors']['Sponsorship'])

            sponsors = [(x['Type'], self.get_member(x['MemberId']))
                        for x in sponsors]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor['Name']))
                bill.add_sponsor('primary' if 'Author' in typ else 'secondary',
                                 name)

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x]
                    for x in ['Description', 'Url', 'Id', 'Version']
                ]
                bill.add_version(name,
                                 url,
                                 mimetype='application/pdf',
                                 _internal_document_id=doc_id,
                                 _version_id=version_id)

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(
                SOURCE_URL.format(**{
                    "session": session,
                    "bid": guid,
                }))
            self.save_bill(bill)
Esempio n. 24
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page and save the bill.

        :param chamber: chamber passed to ``Bill`` and used as the actor of
            every action
        :param session: session passed to ``Bill``
        :param bill_id: bill identifier; sent as the ``r`` query parameter
        :param bill_type: type passed to ``Bill``
        :raises NoSuchBill: when the page has no title cell
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
            bill.add_sponsor('primary', author.strip())

            # the names are read from the second text node of the cell, so
            # there is nothing to parse unless at least two nodes exist
            co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
            if len(co_authors) > 1:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor', co_author.strip())

            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")

                action = tds[1].text_content().strip()

                # A linked action is either a new version of the bill or an
                # unrelated document; an empty list means plain text.
                action_url = tds[1].xpath('a/@href')
                if action_url:
                    action_url = action_url[0]
                    #NOTE: not sure if new versions of the bill are only denoted with 'Entirillado' OR if that's the correct name but from what i gather it looks like it.
                    if re.match('Entirillado', action):
                        bill.add_version(action, action_url)
                    else:
                        bill.add_document(action, action_url)

                for pattern, atype in _classifiers:
                    if re.match(pattern, action):
                        break
                else:
                    atype = 'other'

                bill.add_action(chamber, action, date, type=atype)

                # a linked "passed" action is assumed to be a vote document
                if atype == 'bill:passed' and action_url:
                    # Only a real match may set the chamber. Unpacking
                    # straight into vote_chamber would leave it holding the
                    # last table entry when nothing matched.
                    vote_chamber = None
                    for pattern, candidate in _voteChambers:
                        if re.match(pattern, action):
                            vote_chamber = candidate
                            break
                    else:
                        self.warning('coudnt find voteChamber pattern')

                    # only lower-chamber votes are scraped here
                    if vote_chamber == 'lower':
                        vote = self.scrape_votes(action_url, action, date,
                                                 vote_chamber)
                        # scrape_votes' result is indexed as (vote, detail);
                        # a None vote means it could not be read
                        if vote[0] is not None:
                            vote[0].add_source(action_url)
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Esempio n. 25
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        with self.urlopen(url) as bill_dir_page:
            root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
            for mr in root.xpath('//lastaction/msrgroup'):
                bill_id = mr.xpath('string(measure)').replace(" ", "")
                if bill_id[0] == "S":
                    chamber = "upper"
                else:
                    chamber = "lower"

                bill_type = {'B':'bill', 'C': 'concurrent resolution',
                             'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

                # just skip past bills that are of the wrong chamber
                if chamber != chamber_to_scrape:
                    continue

                link = mr.xpath('string(actionlink)').replace("..", "")
                main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
                main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as details_page:
                    details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type, longtitle=longtitle)

                    #sponsors
                    main_sponsor = details_root.xpath('string(//p_name)').split()
                    if main_sponsor:
                        main_sponsor = main_sponsor[0]
                        main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        main_sponsor_url =  'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                        type = "primary"
                        bill.add_sponsor(type, main_sponsor, main_sponsor_url = main_sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        if leg:
                            leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                            type = "cosponsor"
                            bill.add_sponsor(type, leg, leg_url=leg_url)

                    #Versions 
                    curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                    curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                    bill.add_version("Current version", curr_version_url)

                    intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                    intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                    bill.add_version("As Introduced", intro_version_url)

                    comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                    if comm_version.find("documents") != -1:
                        comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                        bill.add_version("Committee Substitute", comm_version_url)

                    passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                    if passed_version.find("documents") != -1:
                        passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                        title = "As Passed the " + chamber
                        bill.add_version(title, passed_version_url)

                    asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                    if asg_version.find("documents") != -1:
                        asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                        bill.add_version("Approved by the Governor", asg_version_url)

                    # avoid duplicate votes
                    seen_votes = set()

                    #Actions
                    for action in details_root.xpath('//history/action'):
                        action_num  = action.xpath('string(act_number)').strip()
                        action_num = int(action_num)
                        act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                        action_desc = action.xpath('string(act_desc)')
                        date, action_desc = action_desc.split(" ", 1)
                        date = date + "/" + session[0:4]
                        date = datetime.strptime(date, "%m/%d/%Y")

                        if action_desc.startswith("(H)"):
                            actor = "lower"
                            action = action_desc[4:]
                        elif action_desc.startswith("(S)"):
                            actor = "upper"
                            action = action_desc[4:]
                        else:
                            actor = "executive"
                            action = action_desc

                        if action.find("Veto") != -1:
                            version_path = details_root.xpath("string(//veto_other)")
                            version_path = version_path.replace("../../../../", "")
                            version_url = "http://billstatus.ls.state.ms.us/" + version_path
                            bill.add_document("Veto", version_url) 

                        atype = 'other'
                        for prefix, prefix_type in self._action_types:
                            if action.startswith(prefix):
                                atype = prefix_type
                                break

                        bill.add_action(actor, action, date, type=atype,
                                        action_num=action_num)

                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            if vote_url not in seen_votes:
                                seen_votes.add(vote_url)
                                vote = self.scrape_votes(vote_url, action,
                                                         date, actor)
                                vote.add_source(vote_url)
                                bill.add_vote(vote)

                    bill.add_source(bill_details_url)
                    self.save_bill(bill)
Esempio n. 26
0
    def scrape(self, chamber, session):
        """Scrape all Nebraska bills listed for the session's start year.

        Loads the search-by-date listing for the year found in
        ``self.metadata['session_details'][session]['start_date']``, follows
        every bill link in the results table and saves one ``Bill`` per bill
        page, including its primary sponsor, actions, votes, versions and
        related documents.

        :param chamber: chamber recorded on every ``Bill`` and ``Vote``; also
            used as the actor for actions that mention neither the Governor
            nor the Speaker.
        :param session: key into ``self.metadata['session_details']``.
        """
        site_root = 'http://nebraskalegislature.gov/'
        # Every element read below lives under this container.
        content = ('/html/body/div[@id="wrapper"]/div[@id="content"]'
                   '/div[@id="content_text"]')

        year = self.metadata['session_details'][session]['start_date'].year
        main_url = 'http://nebraskalegislature.gov/bills/search_by_date.php?SessionDay=%s' % year

        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for docs in page.xpath(content + '/div[@class="cal_content_full"]/table[@id="bill_results"]/tr/td[1]/a'):
                bill_link = site_root + docs.attrib['href']
                with self.urlopen(bill_link) as bill_page:
                    bill_page = lxml.html.fromstring(bill_page)

                    # Basic info: the first heading token is the bill id, the
                    # second token is skipped and the rest form the title.
                    heading = bill_page.xpath(content + '/h2')[0].text.split()
                    bill_id = heading[0]
                    title = ' '.join(heading[2:])

                    bill_type = 'resolution' if 'LR' in bill_id else 'bill'
                    bill = Bill(session, chamber, bill_id, title, type=bill_type)

                    # sources
                    bill.add_source(main_url)
                    bill.add_source(bill_link)

                    # sponsor
                    introduced_by = bill_page.xpath(content + '/div[2]/table/tr[2]/td[1]/a[1]')[0].text
                    bill.add_sponsor('primary', introduced_by)

                    # actions
                    for row in bill_page.xpath(content + '/div[3]/table/tr[1]/td[1]/table/tr'):
                        date = row[0].text
                        if 'Date' in date:
                            # presumably the table's header row; it carries no action
                            continue
                        date = datetime.strptime(date, '%b %d, %Y')
                        action = row[1].text

                        # A vote shows up as a trailing "yes-no-abstain" token.
                        # Other actions may contain a hyphen too, so only
                        # record a vote when three integers can be parsed
                        # instead of crashing on the first non-vote hyphen.
                        counts = None
                        if '-' in action:
                            try:
                                counts = [int(n) for n in
                                          action.split()[-1].split('-')[:3]]
                            except ValueError:
                                counts = None
                        if counts is not None and len(counts) == 3:
                            yes_count, no_count, abstention_count = counts
                            passed = yes_count > no_count
                            vote = Vote(chamber, date, action, passed, yes_count, no_count, abstention_count)
                            vote.add_source(bill_link)
                            bill.add_vote(vote)

                        if 'Governor' in action:
                            actor = 'Governor'
                        elif 'Speaker' in action:
                            actor = 'Speaker'
                        else:
                            actor = chamber

                        action_type = self.action_types(action)
                        bill.add_action(actor, action, date, action_type)

                    # versions (the first three characters of each href are
                    # dropped before prefixing the site root)
                    for version in bill_page.xpath(content + '/div[2]/table/tr[2]/td[2]/a'):
                        version_url = site_root + version.attrib['href'][3:]
                        bill.add_version(version.text, version_url)

                    # additional info: only links whose URL contains ".pdf"
                    for additional_info in bill_page.xpath(content + '/div[2]/table/tr[2]/td/a'):
                        document_url = site_root + additional_info.attrib['href'][3:]
                        if '.pdf' in document_url:
                            bill.add_document(additional_info.text, document_url)

                    # amendments
                    for amendment in bill_page.xpath(content + '/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
                        amendment_url = site_root + amendment.attrib['href'][3:]
                        bill.add_document(amendment.text, amendment_url)

                    # related transcripts (hrefs are used exactly as found)
                    for transcript in bill_page.xpath(content + '/div[3]/table/tr[2]/td[2]/a'):
                        bill.add_document(transcript.text, transcript.attrib['href'])

                    self.save_bill(bill)
Esempio n. 27
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble and save every bill for a session from the DBF files.

        Bills are created from MAINBILL, then enriched from BILLSPON
        (sponsors), BILLWP (versions and documents), the zipped vote files on
        the FTP server (votes), BILLHIST (actions) and BILLSUBJ (subjects)
        before all of them are saved.

        MAINBILL records with a blank synopsis are skipped, so the other
        files can mention bills that were never created; such records are
        reported through ``self.warning`` and ignored.

        :param session: session identifier, stored on each bill as
            ``str(session)``.
        :param year_abr: year value passed to ``self.get_dbf``, embedded in
            document URLs and used (together with the following year) to
            name the vote archives.
        :raises Exception: if a BILLWP record has a doctype that is missing
            from ``self._doctypes``.
        """

        def bill_id_of(rec):
            # bill ids are the type prefix followed by the un-padded number
            return rec["billtype"] + str(int(rec["billnumber"]))

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_id = bill_id_of(rec)
            title = rec["synopsis"]
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning('invalid bill id in BILLSPON.DBF: %s' % bill_id)
                continue
            if rec["type"] == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning('invalid bill id in BILLWP.DBF: %s' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # keep only the last directory and the file name of the
            # backslash-separated path
            path_parts = rec["document"].split('\\')
            document = path_parts[-2] + "/" + path_parts[-1]
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based on its doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            s_vote_zip, resp = self.urlretrieve(s_vote_url)

            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            votes = {}
            zipedfile = None
            vote_file = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                vote_file = zipedfile.open("%s.txt" % filename, 'U')

                for rec in csv.DictReader(vote_file):
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'],
                                            rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None,
                                              None, None, None,
                                              bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)
            finally:
                # release the handles before deleting the temp file, and
                # delete it even when reading the archive fails
                if vote_file is not None:
                    vote_file.close()
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning('invalid bill id in vote file %s: %s' %
                                 (filename, vote_bill_id))
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning('invalid bill id in BILLHIST.DBF: %s' % bill_id)
                continue
            action, atype = self.categorize_action(rec["action"])
            if rec["comment"]:
                action += (' ' + rec["comment"])
            bill_dict[bill_id].add_action(rec["house"], action,
                                          rec["dateaction"], type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = bill_id_of(rec)
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Esempio n. 28
0
    def scrape(self, session, chambers):
        """Scrape every Vermont bill and resolution for a session.

        Loads the released-bill, introduced-bill and resolution listings
        from the legislature's JSON endpoints, then for each entry reads the
        bill status page (sponsors, text versions) and, when the page
        exposes an internal bill id, the detailed status (actions) and
        roll-call (votes) endpoints. Each bill is saved with
        ``self.save_bill``.

        :param session: session name; ``session[5:]`` is used as the year
            slug in every URL.
        :param chambers: not used by this method.
        :raises AssertionError: on an unrecognised bill number prefix, an
            amendment without a chamber, an action with an unknown chamber
            code, a governor signature recorded before both chambers
            passed the bill, or a roll call whose outcome is unclear.
        """
        HTML_TAGS_RE = r'<.*?>'

        # (bill number prefix, bill type, chamber); order matters because
        # the shorter prefixes would also match the longer bill numbers.
        # A chamber of None means it has to be read from the record's Body.
        bill_prefixes = (
            ('J.R.H.', 'joint resolution', 'lower'),
            ('J.R.S.', 'joint resolution', 'upper'),
            ('H.C.R.', 'concurrent resolution', 'lower'),
            ('S.C.R.', 'concurrent resolution', 'upper'),
            ('H.R.', 'resolution', 'lower'),
            ('S.R.', 'resolution', 'upper'),
            ('PR.', 'constitutional amendment', None),
            ('H.', 'bill', 'lower'),
            ('S.', 'bill', 'upper'),
        )

        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        # NOTE: bills_url is rebound here, so the "introduced" listing is the
        # source recorded below for every non-resolution bill.
        bills_url = \
                'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
                format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
                'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
                format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.iteritems()}

            # Identify the bill type and chamber
            for prefix, bill_type, bill_chamber in bill_prefixes:
                if info['BillNumber'].startswith(prefix):
                    break
            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info['BillNumber']))

            if bill_chamber is None:
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            # Create the bill using its basic information
            bill = Bill(session=session,
                        bill_id=info['BillNumber'],
                        title=info['Title'],
                        chamber=bill_chamber,
                        type=bill_type)
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = \
                    'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                    format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors; entries after the "Additional Sponsors"
            # marker are cosponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li')
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                        replace("Rep.", "").replace("Sen.", "").strip()
                # Skip the five-character "Less" + one character link
                # (presumably the list's collapse toggle). The previous
                # test compared a five-character slice with "Less" and so
                # could never match.
                is_less_link = (len(sponsor_name) == 5 and
                                sponsor_name[:4] == "Less")
                if sponsor_name and not is_less_link:
                    bill.add_sponsor(sponsor_type, sponsor_name)

            # Capture bill text versions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a')
            for version in versions:
                bill.add_version(name=version.xpath('text()')[0],
                                 url=version.xpath('@href')[0].replace(
                                     ' ', '%20'),
                                 mimetype='application/pdf')

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            id_match = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(year_slug),
                lxml.etree.tostring(doc))
            if id_match is None:
                self.warning("Bill {} appears to have no activity".\
                        format(info['BillNumber']))
                self.save_bill(bill)
                continue
            internal_bill_id = id_match.group(1)

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v.strip() for k, v in action.iteritems()}
                signed = "Signed by Governor" in action['FullStatus']

                if signed:
                    actor = 'governor'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if signed:
                    # raised explicitly, like the other checks in this
                    # method, so it also runs under ``python -O``
                    if chambers_passed != set("HS"):
                        raise AssertionError(
                            "Bill signed before passing both chambers")
                    action_type = 'governor:signed'
                elif any(keyword.strip().lower().startswith('aspassed')
                         for keyword in action['keywords'].split(';')):
                    # keywords are ';'-separated and may carry surrounding
                    # whitespace, so strip them for both chambers
                    action_type = 'bill:passed'
                    chambers_passed.add("H" if actor == 'lower' else "S")
                else:
                    action_type = 'other'

                bill.add_action(actor=actor,
                                action=re.sub(HTML_TAGS_RE, "",
                                              action['FullStatus']),
                                date=datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                type=action_type)

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
                    format(year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                        format(year_slug, roll_call_id)
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_other = []
                for member in roll_call:
                    # names have the form "<member> of <district>"
                    (member_name,
                     _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_other.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # The official totals come from the status text; the member
                # lists are checked against them by validate() below
                yea_count = \
                        int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = \
                        int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = Vote(chamber=('lower' if vote['ChamberCode']
                                            == 'H' else 'upper'),
                                   date=datetime.datetime.strptime(
                                       vote['StatusDate'], '%m/%d/%Y'),
                                   motion=re.sub(HTML_TAGS_RE, "",
                                                 vote['FullStatus']).strip(),
                                   passed=did_pass,
                                   yes_count=yea_count,
                                   no_count=nay_count,
                                   other_count=len(roll_call_other))
                vote_to_add.add_source(roll_call_url)

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_other:
                    vote_to_add.other(member)

                # a count mismatch is reported but does not drop the vote
                try:
                    vote_to_add.validate()
                except ValueError as e:
                    self.warning(e)

                bill.add_vote(vote_to_add)

            # Capture extra information
            # This is not in the OpenStates spec, but is available
            # Not yet implemented
            # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            self.save_bill(bill)
Esempio n. 29
0
    def scrape(self, chamber, session):
        self.all_bills = {}
        self.slug = self.metadata['session_details'][session]['slug']

        page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
        page.make_links_absolute(self.base_url)

        ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
        header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]

        #Every ul with a data-load-action and an id
        bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                       " and boolean(@id)]/@data-load-action")

        bill_anchors = []

        for bill_list_url in bill_list_pages:
            bill_list_page = self.lxmlize('{}{}'.format(
                self.base_url, bill_list_url))
            bill_list_page.make_links_absolute(self.base_url)
            bill_anchors.extend(bill_list_page.xpath('//a') or [])

        ws = re.compile(r"\s+")

        def _clean_ws(txt):
            """Remove extra whitespace from text."""
            return ws.sub(' ', txt).strip()

        for a in bill_anchors:
            bid = ws.sub('', a.text_content())  # bill id
            bill_summary = _clean_ws(a.get('title'))
            # bill title is added below
            bill = Bill(session, chamber, bid, title='', summary=bill_summary)
            page = self.lxmlize(a.get('href'))
            versions = page.xpath('//ul[@class="dropdown-menu"]/li/span/' +
                                  'a[contains(@title, "Get the Pdf")]/@href')

            measure_info = {}
            info = page.xpath("//table[@id='measureOverviewTable']/tr")
            for row in info:
                key, value = row.xpath("./*")
                key = key.text.replace(':', '').strip()
                measure_info[key] = value

            for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
                if sponsor.text_content().strip():
                    bill.add_sponsor(type='primary',
                                     name=sponsor.text_content())

            for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
                if sponsor.text_content().strip():
                    bill.add_sponsor(type='cosponsor',
                                     name=sponsor.text_content())

            title = _clean_ws(measure_info['Bill Title'].text_content())
            # some bill titles need to be added manually
            if self.slug == "2013R1" and bid == "HB2010":
                title = ("Relating to Water Resources Department contested"
                         "case proceedings.")
            bill['title'] = title

            for version in versions:
                name = version.split("/")[-1]
                bill.add_version(name=name,
                                 url=version,
                                 mimetype='application/pdf')

            history_url = self.create_url(
                'Measures/Overview/GetHistory/{bill}', bid)
            history = self.lxmlize(history_url).xpath("//table/tr")
            for entry in history:
                wwhere, action = [
                    _clean_ws(x.text_content()) for x in entry.xpath("*")
                ]
                vote_cleaning_re = r'(.*?)((Ayes)|(Nays),\s.*)'
                if re.match(vote_cleaning_re, action):
                    action = re.search(vote_cleaning_re, action).groups()[0]
                wwhere = re.match(r"(?P<when>.*) \((?P<where>.*)\)",
                                  wwhere).groupdict()

                action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
                when = "%s-%s" % (self.slug[:4], wwhere['when'])
                when = dt.datetime.strptime(when, "%Y-%m-%d")

                types = []
                for expr, types_ in self.action_classifiers:
                    m = re.match(expr, action)
                    if m:
                        types += types_

                if types == []:
                    types = ['other']

                # actor, action, date, type, committees, legislators
                bill.add_action(action_chamber, action, when, type=types)

                # Parse and store Vote information
                vote_id = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
                if not vote_id:
                    continue
                elif "#measureVotes-" in vote_id[0]:
                    vote_id = vote_id[0].split("-")[-1]
                    vote_url = "https://olis.leg.state.or.us/liz/" + \
                            "{0}/Measures/MeasureVotes?id={1}". \
                            format(self.slug, vote_id)
                else:
                    vote_id = vote_id[0].split("-")[-1]
                    vote_url = "https://olis.leg.state.or.us/liz/" + \
                            "{0}/CommitteeReports/MajorityReport/{1}". \
                            format(self.slug, vote_id)

                votes = self._get_votes(vote_url)
                if not any(len(x) for x in votes.values()):
                    self.warning("The votes webpage was empty for " +
                                 "action {0} on bill {1}.".format(action, bid))
                    continue

                passed = (float(len(votes["yes_votes"])) /
                          (len(votes["yes_votes"]) + len(votes["no_votes"])) >
                          0.5)

                vote = Vote(chamber=chamber,
                            date=when,
                            motion=action,
                            passed=passed,
                            yes_count=len(votes["yes_votes"]),
                            no_count=len(votes["no_votes"]),
                            other_count=len(votes["other_votes"]),
                            session=session,
                            bill_id=bid,
                            bill_chamber=action_chamber)

                vote.update(votes)
                bill_url = "https://olis.leg.state.or.us/liz/" + \
                        "{0}/Measures/Overview/{1}".format(self.slug, bid)
                vote.add_source(bill_url)

                bill.add_vote(vote)

            amendments_url = self.create_url(
                'Measures/ProposedAmendments/{bill}', bid)
            amendments = self.lxmlize(amendments_url).xpath(
                "//div[@id='amendments']/table//tr")

            for amendment in amendments:
                nodes = amendment.xpath("./td")

                if nodes == []:
                    continue

                pdf_href, date, committee, adopted, when = nodes
                pdf_href, = pdf_href.xpath("./a")
                pdf_link = pdf_href.attrib['href']

                name = "Ammendment %s" % (pdf_href.text_content())

                adopted = adopted.text
                bill.add_document(name=name,
                                  url=pdf_link,
                                  adopted=adopted,
                                  mimetype='application/pdf')

            bill.add_source(a.get('href'))
            self.save_bill(bill)
Esempio n. 30
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble and save every bill of a session from the DBF/vote files.

        The MAINBILL table defines the bills.  Sponsors (BILLSPON), documents
        (BILLWP), roll-call votes (zipped CSV files fetched over FTP), actions
        (BILLHIST) and subjects (BILLSUBJ) are then merged into those bills by
        bill id, and every bill is finally saved with ``self.save_bill``.

        Rows of the secondary sources that name a bill which is not in
        MAINBILL -- or which was skipped there because its synopsis is blank
        -- are reported through ``self.warning`` and ignored, so that a single
        stray row cannot abort the whole scrape.

        :param session: session identifier; stringified and handed to ``Bill``.
        :param year_abr: year used to locate the data files; vote files are
            fetched for this year and for the following one.
        :raises Exception: if a document row carries a doctype that is missing
            from ``self._doctypes``.
        """

        def bill_id_of(rec):
            # A bill id is the type prefix followed by the number as
            # normalised by int().
            return rec["billtype"] + str(int(rec["billnumber"]))

        # Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, "MAINBILL")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_id = bill_id_of(rec)
            title = rec["synopsis"]
            chamber = "lower" if bill_type[0] == "A" else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, "BILLSPON")

        for rec in bill_sponsors_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            sponsor_type = "Primary" if rec["type"] == "P" else "Co-sponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, "BILLWP")

        for rec in bill_document_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]

            # keep only the last directory and the file name of the
            # backslash-separated path stored in the table
            path_parts = rec["document"].split("\\")
            document = path_parts[-2] + "/" + path_parts[-1]
            htm_url = "http://www.njleg.state.nj.us/%s/Bills/%s" % (
                year_abr, document.replace(".DOC", ".HTM"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["doctype"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" % (rec["doctype"], bill_id))
            if rec["comment"]:
                doc_name += " " + rec["comment"]

            if rec["doctype"] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning("could not find %s" % s_vote_url)
                continue

            # "A..."/"CA..." files belong to the lower chamber, the rest to
            # the upper one; a leading "C" marks a committee vote file.
            if filename.startswith("A") or filename.startswith("CA"):
                chamber = "lower"
            else:
                chamber = "upper"
            vote_file_type = "committee" if filename.startswith("C") else "chamber"

            # one Vote per (bill, chamber, motion), filled row by row
            votes = {}
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                try:
                    vote_file = zipedfile.open("%s.txt" % filename, "U")
                    for rec in csv.DictReader(vote_file):
                        if vote_file_type == "chamber":
                            bill_id = rec["Bill"].strip()
                            leg = rec["Full_Name"]
                            date = rec["Session_Date"]
                            action = rec["Action"]
                            leg_vote = rec["Legislator_Vote"]
                        else:
                            bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                            leg = rec["Name"]
                            # drop time portion
                            date = rec["Agenda_Date"].split()[0]
                            # make motion readable
                            action = self._com_vote_motions[rec["BillAction"]]
                            # first char (Y/N) use [0:1] to ignore ''
                            leg_vote = rec["LegislatorVote"][0:1]

                        date = datetime.strptime(date, "%m/%d/%Y")
                        vote_id = "_".join((bill_id, chamber, action))
                        vote_id = vote_id.replace(" ", "_")

                        if vote_id not in votes:
                            votes[vote_id] = Vote(chamber, date, action, None,
                                                  None, None, None,
                                                  bill_id=bill_id)
                        if vote_file_type == "committee":
                            votes[vote_id]["committee"] = \
                                self._committees[rec["Committee_House"]]

                        if leg_vote == "Y":
                            votes[vote_id].yes(leg)
                        elif leg_vote == "N":
                            votes[vote_id].no(leg)
                        else:
                            votes[vote_id].other(leg)
                finally:
                    zipedfile.close()
            finally:
                # remove temp file, even when the archive could not be parsed
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning("unknown bill %s in vote file %s" %
                                 (vote_bill_id, filename))
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, "BILLHIST")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_db:
            bill_id = bill_id_of(rec)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            actor = actor_map[rec["house"]]
            comment = rec["comment"]
            action, atype = self.categorize_action(rec["action"])
            if comment:
                action += " " + comment
            bill_dict[bill_id].add_action(actor, action, rec["dateaction"],
                                          type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, "BILLSUBJ")
        for rec in subject_db:
            bill_id = bill_id_of(rec)
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault("subjects", []).append(rec["subjectkey"])
            else:
                self.warning("invalid bill id in BILLSUBJ.DBF: %s" % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Esempio n. 31
0
class SenateBillPage(object):
    '''Used for categories, senate votes, events.

    Wraps one already-parsed bill page (``doc``) and builds a ``Bill`` from
    it.  Construction immediately extracts the floor/committee votes, the
    sponsor's memo, the subjects and the versions; ``succeeded`` is set to
    True once all of those steps have run without raising.
    '''

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        '''Create the bill and populate it from ``doc``.

        ``bill_id_parts`` must unpack into exactly three values, stored as
        ``letter``, ``number`` and ``version``.  ``url`` is recorded as a
        source of the bill after the page has been processed.
        '''
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

        self.bill.add_source(self.url)

    def _build(self):
        '''Run every extraction step, then flag the page as succeeded.'''
        self.get_senate_votes()
        self.get_sponsors_memo()
        self.get_subjects()
        self.get_versions()
        self.succeeded = True

    def url2lxml(self, url):
        '''Record ``url`` as a bill source and return the scraper's parse of it.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_subjects(self):
        '''Store the text of every "lawsection" link as the bill's subjects.'''
        subjects = []
        for link in self.doc.xpath("//a[contains(@href, 'lawsection')]"):
            # a link without direct text has nothing usable as a subject;
            # skip it instead of failing on None.strip()
            if link.text is not None:
                subjects.append(link.text.strip())

        self.bill['subjects'] = subjects

    def get_sponsors_memo(self):
        '''For upper-chamber bills, attach this page as the sponsor's memo.'''
        if self.chamber == 'upper':
            self.bill.add_document("Sponsor's Memorandum", self.url)

    def _read_roll_call(self, header, sum_yes_no):
        '''Parse the roll call listed after a vote ``header`` element.

        Walks the children of the blockquote siblings that follow
        ``header``: a <b> child opens a section ("Ayes", "Nays", "Excused",
        "Abstain" or "Absent") and carries its count as "(N):", an <a>
        child is a legislator belonging to the current section.

        ``sum_yes_no`` chooses whether repeated Ayes/Nays sections add to
        the running count (True) or replace it (False); counts of the
        "other" sections are always added up.

        Returns ``(counts, yes_names, no_names, other_names)`` where
        ``counts`` maps 'yes'/'no'/'other' to an int and ``other_names``
        holds ``(name, section_label)`` pairs.

        Raises ValueError for a section header of an unknown kind.
        '''
        other_labels = ('Excused', 'Abstain', 'Absent')
        counts = {'yes': 0, 'no': 0, 'other': 0}
        yes_names, no_names, other_names = [], [], []

        vtype = None
        label = None
        for tag in header.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype, label = 'yes', None
                elif text.startswith('Nays'):
                    vtype, label = 'no', None
                else:
                    matched = [p for p in other_labels if text.startswith(p)]
                    if not matched:
                        raise ValueError('bad vote type: %s' % tag.text)
                    vtype, label = 'other', matched[0]

                count = int(re.search(r'\((\d+)\):', text).group(1))
                if vtype == 'other' or sum_yes_no:
                    counts[vtype] += count
                else:
                    counts[vtype] = count
            elif tag.tag == 'a':
                name = tag.text.strip()
                if vtype == 'yes':
                    yes_names.append(name)
                elif vtype == 'no':
                    no_names.append(name)
                elif vtype == 'other':
                    other_names.append((name, label))

        return counts, yes_names, no_names, other_names

    def get_senate_votes(self):
        '''Add every floor vote and committee vote found on the page.'''
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            counts, yes_votes, no_votes, other_votes = self._read_roll_call(
                b, sum_yes_no=False)

            # anything that is not a "yes" counts against passage
            passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote('upper', date, 'Floor Vote', passed, counts['yes'],
                        counts['no'], counts['other'])

            # Maps the section a non-yes/no voter was listed under
            # (Excused / Abstain / Absent) to the names in that section.
            # This used to be keyed by the legislator's own name, which
            # carried no information about how they actually voted.
            actual_vote = collections.defaultdict(list)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, vote_val in other_votes:
                vote.other(name)
                actual_vote[vote_val].append(name)

            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
            # header text is "<label> <tab> - <committee> <tab> - <date>"
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            counts, yes_votes, no_votes, other_votes = self._read_roll_call(
                b, sum_yes_no=True)

            passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote('upper', date, '%s Committee Vote' % committee,
                        passed, counts['yes'], counts['no'], counts['other'])

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, _label in other_votes:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)

    def get_versions(self):
        '''Add one HTML version per bill id listed after "Versions:".

        The version name is the listed id without its trailing "-<year>"
        part; the URL is the open.nysenate.gov bill page for the full id.
        '''
        # the last match is the innermost element containing the label
        text = self.doc.xpath('//*[contains(., "Versions:")]')[-1].text_content()
        _, version_text = text.split('Versions:')

        url_tmpl = 'http://open.nysenate.gov/legislation/bill/'
        for version_bill_id in re.findall(r'\S+', version_text):
            # strip only the final "-suffix": an id may itself contain
            # hyphens, and a bare rsplit('-') would then not unpack in two
            version_bill_id_noyear = version_bill_id.rsplit('-', 1)[0]
            version_url = url_tmpl + version_bill_id
            self.bill.add_version(version_bill_id_noyear, version_url,
                                  mimetype='text/html')
Esempio n. 32
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Scrape and save every bill of one measure type for a session.

        Queries the ``CABill`` rows whose ``session_year`` is ``session`` and
        whose ``measure_type`` is ``type_abbr``.  For each row a ``Bill`` is
        built with its versions, sponsors, classified actions and votes, and
        is then saved with ``self.save_bill``.  A bill for which no version
        yields a title is skipped with a warning.

        :param chamber: 'upper' or anything else (treated as the Assembly);
            also the fallback actor for actions that name none.
        :param session: session string such as '20092010'.
        :param bill_type: type label put first in each version's type list.
        :param type_abbr: measure type abbreviation used in the query.
        :raises ScrapeError: if a vote location starts with neither an
            Assembly nor a Senate marker.
        """
        chamber_name = 'SENATE' if chamber == 'upper' else 'ASSEMBLY'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        # Construct session for web query, going from '20092010' to '0910'
        source_session = session[2:4] + session[6:8]

        # Action text markers that map one-to-one onto an action type.  They
        # are tested against the whitespace-normalised action text, so they
        # must not rely on runs of more than one space.
        simple_action_markers = (
            ('Read third time. Passed.', 'bill:passed'),
            ('Approved by Governor', 'governor:signed'),
            ('Item veto', 'governor:vetoed:line-item'),
            ('Vetoed by Governor', 'governor:vetoed'),
            ('To Governor', 'governor:received'),
            ('Read second time', 'bill:reading:2'),
        )

        # Compiled once: re.sub() only gained a flags argument in 2.7.
        session_suffix_re = re.compile(r'(\w+)( Extraordinary)? Session$',
                                       re.IGNORECASE)
        chamber_prefix_re = re.compile(r'^(Senate|Assembly) ',
                                       re.IGNORECASE)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            # the real title is only known after the versions are read
            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Turn 'AB 10' into 'ab_10'
            source_num = "%s_%s" % (bill.measure_type.lower(),
                                    bill.measure_num)

            # Construct a fake source url
            source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                          "bill_number=%s&sess=%s" %
                          (source_num, source_session))

            fsbill.add_source(source_url)

            scraped_versions = self.scrape_site_versions(bill, source_url)

            title = ''
            short_title = ''
            bill_types = ['bill']
            subject = ''
            all_titles = set()
            # index of the next scraped (date, url) pair not yet matched
            scraped_idx = 0
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                title = clean_title(version.title)
                all_titles.add(title)
                short_title = clean_title(version.short_title)
                bill_types = [bill_type]

                if version.appropriation == 'Yes':
                    bill_types.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    bill_types.append('fiscal committee')
                if version.local_program == 'Yes':
                    bill_types.append('local program')
                if version.urgency == 'Yes':
                    bill_types.append('urgency')
                if version.taxlevy == 'Yes':
                    bill_types.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

                date = version.bill_version_action_date.date()

                # Use the scraped URL only when its date matches this
                # version; otherwise leave the URL empty and keep the
                # scraped entry for a later version.
                url = ''
                try:
                    scraped_version = scraped_versions[scraped_idx]
                    if scraped_version[0] == date:
                        url = scraped_version[1]
                        scraped_idx += 1
                except IndexError:
                    pass

                fsbill.add_version(
                    version.bill_version_id, url,
                    date=date,
                    title=title,
                    short_title=short_title,
                    subject=[subject],
                    type=bill_types)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            # title, short title, type and subject of the last usable version
            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = bill_types
            fsbill['subjects'] = [subject]

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # ``version`` is whatever the loop above left behind, i.e. the
            # last entry of bill.versions.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            introduced = False

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = (action.actor or chamber).strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                action_types = []

                # collapse every whitespace run to a single space
                act_str = re.sub(r'\s+', ' ', action.action)

                if act_str.startswith('Introduced'):
                    introduced = True
                    action_types.append('bill:introduced')

                if 'Read first time.' in act_str:
                    # a first reading doubles as the introduction when no
                    # explicit "Introduced" action was seen before it
                    if not introduced:
                        action_types.append('bill:introduced')
                        introduced = True
                    action_types.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    action_types.append('committee:referred')

                for marker, action_type in simple_action_markers:
                    if marker in act_str:
                        action_types.append(action_type)

                if not action_types:
                    action_types = ['other']

                fsbill.add_action(actor, act_str, action.action_date.date(),
                                  type=action_types)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                # the first word of the location names the chamber, the
                # remainder is the committee (or 'Floor')
                full_loc = vote.location.description
                loc_words = full_loc.split(' ')
                first_part = loc_words[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)
                vote_location = ' '.join(loc_words[1:])

                motion = vote.motion.motion_text or ''

                if ("Third Reading" in motion or "3rd Reading" in motion or
                        "Do Pass" in motion):
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session / chamber / bill-reference boilerplate so
                # that only the motion itself is left.
                motion = session_suffix_re.sub('', motion)
                motion = chamber_prefix_re.sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # NOTE(review): unlike its two neighbours this pattern has
                # no SJR alternative -- confirm whether that is intended.
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                # The abstain count field in CA's database includes
                # vacancies, which we aren't interested in.
                fsvote['other_count'] = len(fsvote['other_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Esempio n. 33
0
    def scrape(self, session, chambers):
        sid = self.metadata['session_details'][session]['_guid']
        legislation = backoff(
            self.lservice.GetLegislationForSession,
            sid
        )['LegislationIndex']
        for leg in legislation:
            lid = leg['Id']
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument['StatusHistory'][0]]
            actions = reversed([{
                "code": x['Code'],
                "action": x['Description'],
                "_guid": x['Id'],
                "date": x['Date']
            } for x in history])

            guid = instrument['Id']

            bill_type = instrument['DocumentType']
            chamber = {
                "H": "lower",
                "S": "upper",
                "J": "joint"
            }[bill_type[0]]  # XXX: This is a bit of a hack.

            bill_id = "%s %s" % (
                bill_type,
                instrument['Number'],
            )
            if instrument['Suffix']:
                bill_id += instrument['Suffix']

            title = instrument['Caption']
            description = instrument['Summary']

            if title is None:
                continue

            bill = Bill(
                session,
                chamber,
                bill_id,
                title,
                description=description,
                _guid=guid
            )

            if instrument['Votes']:
                for vote_ in instrument['Votes']:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                    vote = Vote(
                        {"House": "lower", "Senate": "upper"}[vote_['Branch']],
                        vote_['Date'],
                        vote_['Caption'] or "Vote on Bill",
                        (vote_['Yeas'] > vote_['Nays']),
                        vote_['Yeas'],
                        vote_['Nays'],
                        (vote_['Excused'] + vote_['NotVoting']),
                        session=session,
                        bill_id=bill_id,
                        bill_chamber=chamber)

                    vote.add_source(self.vsource)

                    methods = {"Yea": vote.yes, "Nay": vote.no,}

                    for vdetail in vote_['Votes'][0]:
                        whom = vdetail['Member']
                        how = vdetail['MemberVoted']
                        try:
                            m = methods[how]
                        except KeyError:
                            m = vote.other
                        m(whom['Name'])

                    bill.add_vote(vote)


            types = {
                "HI": ["other"],
                "SI": ["other"],
                "HH": ["other"],
                "SH": ["other"],
                "HPF": ["bill:introduced"],
                "HDSAS": ["other"],
                "SPF": ["bill:introduced"],
                "HSR": ["bill:reading:2"],
                "SSR": ["bill:reading:2"],
                "HFR": ["bill:reading:1"],
                "SFR": ["bill:reading:1"],
                "HRECM": ["bill:withdrawn", "committee:referred"],
                "SRECM": ["bill:withdrawn", "committee:referred"],
                "SW&C": ["bill:withdrawn", "committee:referred"],
                "HW&C": ["bill:withdrawn", "committee:referred"],
                "HRA": ["bill:passed"],
                "SRA": ["bill:passed"],
                "HPA": ["bill:passed"],
                "HRECO": ["other"],
                "SPA": ["bill:passed"],
                "HTABL": ["other"],  # "House Tabled" - what is this?
                "SDHAS": ["other"],
                "HCFR": ["committee:passed:favorable"],
                "SCFR": ["committee:passed:favorable"],
                "HRAR": ["committee:referred"],
                "SRAR": ["committee:referred"],
                "STR": ["bill:reading:3"],
                "SAHAS": ["other"],
                "SE": ["bill:passed"],
                "SR": ["committee:referred"],
                "HTRL": ["bill:reading:3", "bill:failed"],
                "HTR": ["bill:reading:3"],
                "S3RLT": ["bill:reading:3", "bill:failed"],
                "HASAS": ["other"],
                "S3RPP": ["other"],
                "STAB": ["other"],
                "SRECO": ["other"],
                "SAPPT": ["other"],
                "HCA": ["other"],
                "HNOM": ["other"],
                "HTT": ["other"],
                "STT": ["other"],
                "SRECP": ["other"],
                "SCRA": ["other"],
                "SNOM": ["other"],
                "S2R": ["bill:reading:2"],
                "H2R": ["bill:reading:2"],
                "SENG": ["bill:passed"],
                "HENG": ["bill:passed"],
                "HPOST": ["other"],
                "HCAP": ["other"],
                "SDSG": ["governor:signed"],
                "SSG": ["governor:received"],
                "Signed Gov": ["governor:signed"],
                "HDSG": ["governor:signed"],
                "HSG": ["governor:received"],
                "EFF": ["other"],
                "HRP": ["other"],
                "STH": ["other"],
                "HTS": ["other"],
            }

            ccommittees = defaultdict(list)
            committees = instrument['Committees']
            if committees:
                for committee in committees[0]:
                    ccommittees[{
                        "House": "lower",
                        "Senate": "upper",
                    }[committee['Type']]].append(committee['Name'])

            for action in actions:
                chamber = {
                    "H": "lower",
                    "S": "upper",
                    "E": "other",  # Effective Date
                }[action['code'][0]]

                try:
                    _types = types[action['code']]
                except KeyError:
                    self.debug(action)
                    _types = ["other"]

                committees = []
                if any(('committee' in x for x in _types)):
                    committees = [str(x) for x in ccommittees.get(chamber, [])]

                bill.add_action(chamber, action['action'], action['date'], _types,
                                committees=committees,
                                _code=action['code'],
                                _code_id=action['_guid'])

            sponsors = []
            if instrument['Authors']:
                sponsors = instrument['Authors']['Sponsorship']
                if 'Sponsors' in instrument and instrument['Sponsors']:
                    sponsors += instrument['Sponsors']['Sponsorship']

            sponsors = [
                (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
            ]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor['Name']))
                bill.add_sponsor(
                    'primary' if 'Author' in typ else 'seconday',
                     name
                )

            for version in instrument['Versions']['DocumentDescription']:
                name, url, doc_id, version_id = [
                    version[x] for x in [
                        'Description',
                        'Url',
                        'Id',
                        'Version'
                    ]
                ]
                bill.add_version(
                    name,
                    url,
                    mimetype='application/pdf',
                    _internal_document_id=doc_id,
                    _version_id=version_id
                )

            versions = sorted(
                bill['versions'],
                key=lambda x: x['_internal_document_id']
            )
            bill['versions'] = versions

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{
                "session": session,
                "bid": guid,
            }))
            self.save_bill(bill)
Esempio n. 34
0
    def scrape_bill(self, link, chamber, session):
        """
        Scrape one bill detail page and return the resulting Bill.

        The page has no usable classes or ids, so each table is classified
        by the label found in its first cell.  When the page links to a
        substitute bill, that substitute is scraped recursively and this
        bill's id is added to it as an extra title.

        link -- URL of the bill detail page
        chamber -- chamber handed to Bill
        session -- session identifier; also interpolated into document URLs

        Returns the Bill, or None when the page cannot be fetched, is a
        nominee page, has no bill number, or has an unknown bill type.
        """
        legislation_types = {
            'House Bill': 'HB',
            'House Concurrent Resolution': 'HCR',
            'House Joint Resolution': 'HJR',
            'House Resolution': 'HR',
            'Senate Bill': 'SB',
            'Senate Concurrent Resolution': 'SCR',
            'Senate Joint Resolution': 'SJR',
            'Senate Resolution': 'SR',
        }

        text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"
        try:
            page = self.lxmlize(link, True)
        except requests.exceptions.HTTPError:
            self.logger.warning('404. Apparently the bill hasn\'t been posted')
            return
        nominee = self.get_node(page, './/div[@id="page_header"]/text()')
        if nominee is not None and nominee.strip().lower() == "nominee information":
            self.logger.info("Nominee, skipping")
            return

        bill_id = self.get_node(page, './/div[@align="center" or @style="text-align:center"]')
        try:
            bill_id = bill_id.text_content().strip()
        except (AttributeError, IndexError):
            # AttributeError covers a None result from get_node, which the
            # nominee check above already treats as possible
            self.logger.warning("Can't find bill number, skipping")
            return

        # some bill_ids include relevant amendments
        # in the form "SB 10 w/SA1", so we fix it here
        bill_id = bill_id.split("w/")[0]
        bill_id = bill_id.split("(")[0]

        leg_type = None
        for long_name, short_name in legislation_types.items():
            if long_name in bill_id:
                leg_type = short_name
                bill_num = bill_id.replace(long_name, "").strip()
                break
        if leg_type:
            bill_id = leg_type + " " + bill_num
        elif "for" in bill_id:
            bill_id = bill_id.split("for")[1]
        else:
            self.logger.warning("Unknown bill type for {}".format(bill_id))
            return

        bill_id = bill_id.replace('&nbsp', "")
        bill_id = bill_id.strip()

        # each row is in its own table
        # there are no classes/ids or anything, so we're going to loop
        # through the individual tables and look for keywords
        # in the first td to tell us what we're looking at
        tables = self.get_nodes(page, './/div[@id="page_content"]/table')

        bill_title = None
        primary_sponsors = []
        # bound up front: pages without a "primary sponsor" row used to hit
        # a NameError when the sponsors were attached further down
        addl_sponsors = []
        cosponsors = []
        bill_url = None
        bill_documents = {}
        action_list = []
        vote_documents = {}
        sub_link = None

        if not tables:
            self.logger.warning('First xpath didn\'t work.')
            tables = self.get_nodes(page, './/table[@style="width:837.0px"]/tr')

        # "or []" keeps a None result from the fallback lookup iterable
        for table in tables or []:
            tds = table.xpath('.//td')
            if len(tds) == 0:
                # some kind of empty table for formatting reasons
                continue
            title_text = tds[0].text_content().strip().lower()

            if title_text.startswith('primary sponsor'):
                pri_sponsor_text = tds[1].text_content()
                primary_sponsors = self.separate_names(pri_sponsor_text)
                # sometimes additional sponsors are in a 3rd td
                # other times the 3rd td contains a blank image
                addl_sponsors = []
                if len(tds) > 2:
                    add_spons_text = tds[2].text_content().strip()
                    if add_spons_text:
                        add_spons_text = add_spons_text.replace("Additional Sponsor(s):", "")
                        if "on behalf of all representatives" not in add_spons_text.lower():
                            addl_sponsors = self.separate_names(add_spons_text)

            elif title_text.startswith('co-sponsor'):
                cosponsor_text = tds[1].text_content()
                if "none..." in cosponsor_text.lower():
                    cosponsors = []
                    continue
                cosponsors = self.separate_names(cosponsor_text)

            elif title_text.startswith('long title'):
                bill_title = tds[1].text_content().strip()

            elif title_text.startswith('amendment'):
                for a in tds[1].xpath('.//a'):
                    amm = (a.text or "").strip()
                    if not amm:
                        # anchor without direct text: nothing to name it by
                        continue
                    # the amendment name goes into the key; the previous
                    # format string had no placeholder, so every amendment
                    # overwrote the one before it
                    amm_text = "Amendment {}".format(amm)
                    amm_slg = "+".join(amm.split())
                    amm_link = text_base_url.format(session=session,
                                                    bill_id=amm_slg)
                    bill_documents[amm_text] = amm_link
                    amm_page = self.lxmlize(a.attrib["href"])
                    for tr in amm_page.xpath('//tr'):
                        amm_tds = tr.xpath("./td")
                        if len(amm_tds) > 1:
                            if "voting" in amm_tds[0].text_content().lower():
                                self.find_vote(amm_tds, vote_documents, "Amendment: ")

            elif title_text.startswith('engrossed version'):
                if tds[1].text_content().strip():
                    engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                    engrossment_link = engrossment_base.format(
                        session=session,
                        bill_id="+".join(bill_id.split()))
                    # de-duplicate on the engrossment link itself (this used
                    # to test bill_url, which belongs to the full-text branch)
                    if engrossment_link not in bill_documents.values():
                        bill_documents["Engrossed Version"] = engrossment_link

            elif title_text.startswith('substituted'):
                content = tds[1].text_content().strip()
                if "Substitute" in content and "Original" not in content:
                    sub_link = tds[1].xpath(".//a/@href")[0]

            elif ("full text" in title_text
                  and ("(" not in title_text
                       or "html" in title_text)):
                if tds[1].text_content().strip():
                    # it is totally unclear which version of the bill is referred to here
                    # so I'm just calling it "bill text"
                    bill_url = text_base_url.format(
                        session=session,
                        bill_id=bill_id.replace(" ", "+"))
                    if bill_url not in bill_documents.values():
                        bill_documents["Bill Text"] = bill_url

            elif title_text.startswith('fiscal notes'):
                # skipping fiscal notes for now, they are really ugly
                # but leaving in as a placeholder so we can remember to
                # do this someday, if we feel like it
                pass

            elif title_text.startswith('committee reports'):
                # the committee reports let a legislator
                # comment on a bill. They can comment as
                # "favorable","unfavorable" or "on its merits"
                # but these are NOT votes (per conversation w
                # secretary of the DE senate 3/16/15). The bill is
                # considered if the majority sign it, which will
                # appear in the bill's action history as being
                # reported out of committee
                pass

            elif title_text.startswith('voting'):
                self.find_vote(tds, vote_documents)
            elif title_text.startswith('actions history'):
                action_list = tds[1].text_content().split("\n")

        sub_versions = []
        use_sub = False
        if sub_link:
            bill = self.scrape_bill(sub_link, chamber, session)
            if bill:
                sub_versions = [v["url"] for v in bill["versions"]]
                bill.add_title(bill_id)
                use_sub = True

        if not use_sub:
            bill = Bill(session, chamber, bill_id, bill_title)

            for s in primary_sponsors:
                bill.add_sponsor("primary", s)

            for s in addl_sponsors:
                # it is not totally clear whether "additional sponsors"
                # are co or primary but primary is my best guess
                # based on the bill text, bc they're on the first
                # line with the primary sponsor
                bill.add_sponsor("primary", s)

            for s in cosponsors:
                bill.add_sponsor("cosponsor", s)

        for name, doc_link in bill_documents.items():
            # match the keys actually stored above; the old "Engrossment"
            # test never matched "Engrossed Version"
            if name in ("Engrossed Version", "Bill Text"):
                if doc_link not in sub_versions:
                    bill.add_version(name, doc_link, mimetype="text/html")
            else:
                bill.add_document(name, doc_link, mimetype="text/html")

        for a in action_list:
            if a.strip():
                action_date, action = a.split('-', 1)
                try:
                    action_date = datetime.strptime(action_date.strip(), '%b %d, %Y')
                except ValueError:
                    action_date = datetime.strptime(action_date.strip(), '%B %d, %Y')  # XXX: ugh.
                action = action.strip()
                actor = actions.get_actor(action, bill['chamber'])
                attrs = dict(actor=actor, action=action, date=action_date)
                attrs.update(**self.categorizer.categorize(action))
                attrs["action"] = " ".join(attrs["action"].split())
                bill.add_action(**attrs)

        for name, doc in vote_documents.items():
            vote_chamber = "lower" if "house" in name.lower() else "upper"
            try:
                self.head(doc)
            except requests.exceptions.HTTPError:
                self.logger.warning("could not access vote document")
                continue

            vote_page = self.lxmlize(doc)

            try:
                vote_info = vote_page.xpath('.//div[@id="page_content"]/p')[-1]
                vote_tds = vote_page.xpath(".//table//td")
            except IndexError:
                vote_info = vote_page.xpath('.//form[1]')[0]
                vote_tds = vote_page.xpath('.//table[@border="0"]//td')

            yes_votes = []
            no_votes = []
            other_votes = []

            # reset for every document so nothing leaks in from the action
            # loop above or from the previously parsed vote
            vote_date = None
            passed = None
            voice_vote = None
            yes_count = no_count = other_count = 0

            lines = [l for l in vote_info.text_content().split("\n") if l]

            for line in lines:
                if line.strip().startswith("Date"):
                    date_str = " ".join(line.split()[1:4])
                    vote_date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                    passage_status = line.strip().split()[-1]

                    # we've never seen a vote with anything but "passed"
                    # so throw an error otherwise so we can figure it out
                    passed_statuses = ["Passed"]
                    failed_statuses = ["Defeated", "Rescinded"]
                    if passage_status not in passed_statuses + failed_statuses:
                        raise AssertionError("Unknown passage state {}".format(passage_status))
                    passed = passage_status in passed_statuses

                if line.strip().startswith("Vote Type"):
                    if "voice" in line.lower():
                        voice_vote = True
                    else:
                        voice_vote = False
                        yes_count = int(re.findall(r"Yes: (\d+)", line)[0])
                        no_count = int(re.findall(r"No: (\d+)", line)[0])
                        other_count = int(re.findall(r"Not Voting: (\d+)", line)[0])
                        other_count += int(re.findall(r"Absent: (\d+)", line)[0])

                        # cells are walked as name/vote pairs: a non-empty
                        # cell is taken as a name, the next one as the vote
                        person_seen = False

                        for td in vote_tds:
                            if person_seen:
                                person_vote = td.text_content().strip()
                                if person_vote == "Y":
                                    yes_votes.append(person)
                                elif person_vote == "N":
                                    no_votes.append(person)
                                elif person_vote in ["NV", "A", "X", "C"]:
                                    other_votes.append(person)
                                else:
                                    raise AssertionError("Unknown vote '{}'".format(person_vote))
                                person_seen = False
                            else:
                                person = td.text_content().strip()
                                if person:
                                    person_seen = True

            if vote_date is None or passed is None or voice_vote is None:
                # no "Date" or "Vote Type" line: skip rather than build a
                # vote from undefined or stale values
                self.logger.warning("could not parse vote document")
                continue

            if voice_vote:
                vote = Vote(vote_chamber, vote_date, "passage", passed, 0, 0, 0)
            else:
                vote = Vote(vote_chamber, vote_date, "passage",
                            passed, yes_count, no_count, other_count,
                            yes_votes=[],
                            no_votes=[],
                            other_votes=[])

                vote["yes_votes"] = yes_votes
                vote["no_votes"] = no_votes
                vote["other_votes"] = other_votes

            if (passed
                    and vote["yes_count"] <= vote["no_count"]
                    and not voice_vote):
                raise AssertionError("Vote passed with more N than Y votes?")

            if not passed and vote["yes_count"] > vote["no_count"]:
                self.logger.warning("Vote did not pass but had a majority \
                        probably worth checking")

            if "Amendment" in name:
                vote["type"] = "amendment"
            else:
                vote["type"] = "passage"
            vote.add_source(doc)
            bill.add_vote(vote)

        bill.add_source(link)

        return bill
Esempio n. 35
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Fetch a single bill page (2009 session and later) and save the bill.

        Gathers the title, subjects, linked documents and versions,
        sponsors, actions and any votes found in the action table, then
        passes the finished Bill to save_bill.
        """
        compact_id = bill_id.replace(' ', '')
        url = BILL_URL % (session, compact_id)
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            link_base = 'http://legislature.idaho.gov/legislation/%s/' % session
            html.make_links_absolute(link_base)

            content_cell = html.xpath('./body/table/tr/td[2]')[0]
            bill_tables = content_cell.xpath('.//table')

            bill = Bill(session, chamber, bill_id,
                        bill_tables[1].text_content().strip(),
                        type=get_bill_type(bill_id))
            bill.add_source(url)
            bill['subjects'] = self._subjects[compact_id]

            # keep the short title as an alternate when it differs
            if short_title and bill['title'].lower() != short_title.lower():
                bill.add_title(short_title)

            # linked files: bill texts become versions, the rest documents
            for anchor in html.xpath('//span/a'):
                label = anchor.text_content().strip()
                target = anchor.get('href')
                is_version = 'Engrossment' in label or 'Bill Text' in label
                if is_version:
                    bill.add_version(label, target)
                else:
                    bill.add_document(label, target)

            # sponsors may be a committee, one legislator or several;
            # everything after each 'by' is a comma-separated list
            sponsor_chunks = bill_tables[0].text_content().split('by')
            for chunk in sponsor_chunks[1:]:
                for person in chunk.split(','):
                    bill.add_sponsor('primary', person)

            actor = chamber
            last_date = None
            for row in bill_tables[2]:
                # lots of empty rows
                if len(row) == 1:
                    continue
                cell_texts = [cell.text_content().strip() for cell in row]
                _, date_text, action, _ = cell_texts

                # rows without a date inherit the most recent one
                if not date_text:
                    date_text = last_date
                last_date = date_text

                when = datetime.datetime.strptime(
                    date_text + '/' + session[0:4], "%m/%d/%Y")

                if action.startswith('House'):
                    actor = 'lower'
                elif action.startswith('Senate'):
                    actor = 'upper'

                # votes
                if 'AYES' in action or 'NAYS' in action:
                    vote = self.parse_vote(actor, when, row[2])
                    vote.add_source(url)
                    bill.add_vote(vote)

                # when the cell has child elements (text split by <br>),
                # rebuild the action from all of its text fragments
                if len(row[2]):
                    action = "".join(row[2].itertext())
                action = action.replace(u'\xa0', ' ').strip()

                bill.add_action(actor, action, when,
                                type=get_action(actor, action))

                # after a vote and some other actions the bill is sent
                # 'to House' or 'to Senate', which changes the next actor
                if 'to House' in action:
                    actor = 'lower'
                elif 'to Senate' in action:
                    actor = 'upper'
            self.save_bill(bill)
Esempio n. 36
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Walk the bill sheet for one chamber and save every bill on it.

        Each usable table row yields a Bill with its versions, history
        actions, sponsors and votes attached.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # column positions within a row of the sheet
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # bill id prefix -> bill type
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        chamber_map = dict(Senate='upper', House='lower')

        sheet_page = lxml.html.fromstring(self.urlopen(sheet_url))
        sheet_page.make_links_absolute(sheet_url)

        for bill in sheet_page.xpath('//table/tr'):
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id is None:
                # Every other entry is null for some reason
                continue

            dot_at = bill_id.find('.')
            if dot_at >= 0:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_at]

            title_cell = bill[index["title_sponsor"]][0]
            bill_title = title_cell.text
            full_text = title_cell.text_content()
            if bill_title is None:
                continue  # Odd ...

            # what is left after removing the title is the sponsor list
            sponsor_text = full_text.replace(bill_title, "")
            sponsor_text = sponsor_text.replace(" & ...", "")
            sponsors = sponsor_text.split("--")

            bill_type = None
            for prefix, category in cats.items():
                if bill_id[:len(prefix)] == prefix:
                    bill_type = category

            b = Bill(session,
                     bill_chamber,
                     bill_id,
                     bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            version_anchor = bill[index["version"]].xpath('font/a')[0]
            versions_url = version_anchor.attrib["href"]
            for version in self.parse_versions(versions_url):
                b.add_version(version['name'],
                              version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = bill[index["history"]][0][0].attrib['href']

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action, date in history:
                attrs = dict(actor=chamber_map.get(chamber, chamber),
                             action=action,
                             date=date)
                attrs.update(self.categorizer.categorize(action))
                b.add_action(**attrs)

            for entry in sponsors:
                if entry is None or entry in ("(NONE)", ""):
                    continue
                if "&" in entry:
                    names = [part.strip() for part in entry.split("&")]
                else:
                    names = [entry]
                for sponsor_name in names:
                    b.add_sponsor("primary", sponsor_name)

            # Now that we have history, let's see if we can't grab some
            # votes

            (vote_anchor,) = bill.xpath(".//a[contains(text(), 'Votes')]")
            bill_vote_href = vote_anchor.attrib['href']
            votes = self.parse_votes(bill_vote_href)

            if (votes['sanity-check'] ==
                    'This site only supports frames compatible browsers!'):
                votes['votes'] = []
            elif votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                assert votes['sanity-check'] == bill_id

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                # It's now like: 04/01/2011 02:10:14 PM
                timestamp = "%s %s" % (passage['x-parent-date'],
                                       passage['TIME'])
                pydate = dt.datetime.strptime(timestamp,
                                              "%m/%d/%Y %I:%M:%S %p")

                in_house = "House" in passage['x-parent-ctty']
                in_senate = "Senate" in passage['x-parent-ctty']
                if in_house and in_senate:
                    actor = "joint"
                elif in_house:
                    actor = "lower"
                else:
                    actor = "upper"

                other = int(result['EXC']) + int(result['ABS'])
                # OK, sometimes the Other count is wrong, so recount from
                # the individual votes.
                local_other = sum(
                    1 for voter in filed_votes
                    if filed_votes[voter].lower().strip() not in ("yes", "no")
                )

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" %
                                 (other, local_other))
                    other = local_other

                passed = (result['FINAL_ACTION'] == "PASS")
                if passage['MOTION'].strip() == "":
                    continue

                if "without objection" in passage['MOTION'].lower():
                    passed = True

                v = Vote(actor,
                         pydate,
                         passage['MOTION'],
                         passed,
                         int(result['YES']),
                         int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(vote['meta']['url'])

                # XXX: Add more stuff to kwargs, we have a ton of data
                seen = set()
                for who in filed_votes:
                    if who in seen:
                        raise Exception("Seeing the double-thing. - bug #702")
                    seen.add(who)

                    choice = filed_votes[who].lower()
                    if choice == "yes":
                        v.yes(who)
                    elif choice == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
Esempio n. 37
0
    def scrape(self, session, chambers):
        sid = self.metadata["session_details"][session]["_guid"]
        legislation = backoff(self.lservice.GetLegislationForSession, sid)["LegislationIndex"]
        for leg in legislation:
            lid = leg["Id"]
            instrument = backoff(self.lservice.GetLegislationDetail, lid)
            history = [x for x in instrument["StatusHistory"][0]]
            actions = reversed(
                [{"code": x["Code"], "action": x["Description"], "_guid": x["Id"], "date": x["Date"]} for x in history]
            )

            guid = instrument["Id"]

            bill_type = instrument["DocumentType"]
            chamber = {"H": "lower", "S": "upper", "J": "joint"}[bill_type[0]]  # XXX: This is a bit of a hack.

            bill_id = "%s %s" % (bill_type, instrument["Number"])
            if instrument["Suffix"]:
                bill_id += instrument["Suffix"]

            title = instrument["Caption"]
            description = instrument["Summary"]

            if title is None:
                continue

            bill = Bill(session, chamber, bill_id, title, description=description, _guid=guid)

            if instrument["Votes"]:
                for vote_ in instrument["Votes"]:
                    _, vote_ = vote_
                    vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                    vote = Vote(
                        {"House": "lower", "Senate": "upper"}[vote_["Branch"]],
                        vote_["Date"],
                        vote_["Caption"] or "Vote on Bill",
                        (vote_["Yeas"] > vote_["Nays"]),
                        vote_["Yeas"],
                        vote_["Nays"],
                        (vote_["Excused"] + vote_["NotVoting"]),
                        session=session,
                        bill_id=bill_id,
                        bill_chamber=chamber,
                    )

                    vote.add_source(self.vsource)

                    methods = {"Yea": vote.yes, "Nay": vote.no}

                    for vdetail in vote_["Votes"][0]:
                        whom = vdetail["Member"]
                        how = vdetail["MemberVoted"]
                        try:
                            m = methods[how]
                        except KeyError:
                            m = vote.other
                        m(whom["Name"])

                    bill.add_vote(vote)

            types = {
                "HI": ["other"],
                "SI": ["other"],
                "HH": ["other"],
                "SH": ["other"],
                "HPF": ["bill:introduced"],
                "HDSAS": ["other"],
                "SPF": ["bill:introduced"],
                "HSR": ["bill:reading:2"],
                "SSR": ["bill:reading:2"],
                "HFR": ["bill:reading:1"],
                "SFR": ["bill:reading:1"],
                "HRECM": ["bill:withdrawn", "committee:referred"],
                "SRECM": ["bill:withdrawn", "committee:referred"],
                "SW&C": ["bill:withdrawn", "committee:referred"],
                "HW&C": ["bill:withdrawn", "committee:referred"],
                "HRA": ["bill:passed"],
                "SRA": ["bill:passed"],
                "HPA": ["bill:passed"],
                "HRECO": ["other"],
                "SPA": ["bill:passed"],
                "HTABL": ["other"],  # "House Tabled" - what is this?
                "SDHAS": ["other"],
                "HCFR": ["committee:passed:favorable"],
                "SCFR": ["committee:passed:favorable"],
                "HRAR": ["committee:referred"],
                "SRAR": ["committee:referred"],
                "STR": ["bill:reading:3"],
                "SAHAS": ["other"],
                "SE": ["bill:passed"],
                "SR": ["committee:referred"],
                "HTRL": ["bill:reading:3", "bill:failed"],
                "HTR": ["bill:reading:3"],
                "S3RLT": ["bill:reading:3", "bill:failed"],
                "HASAS": ["other"],
                "S3RPP": ["other"],
                "STAB": ["other"],
                "SRECO": ["other"],
                "SAPPT": ["other"],
                "HCA": ["other"],
                "HNOM": ["other"],
                "HTT": ["other"],
                "STT": ["other"],
                "SRECP": ["other"],
                "SCRA": ["other"],
                "SNOM": ["other"],
                "S2R": ["bill:reading:2"],
                "H2R": ["bill:reading:2"],
                "SENG": ["bill:passed"],
                "HENG": ["bill:passed"],
                "HPOST": ["other"],
                "HCAP": ["other"],
                "SDSG": ["governor:signed"],
                "SSG": ["governor:received"],
                "Signed Gov": ["governor:signed"],
                "HDSG": ["governor:signed"],
                "HSG": ["governor:received"],
                "EFF": ["other"],
                "HRP": ["other"],
                "STH": ["other"],
                "HTS": ["other"],
            }

            ccommittees = defaultdict(list)
            committees = instrument["Committees"]
            if committees:
                for committee in committees[0]:
                    ccommittees[{"House": "lower", "Senate": "upper"}[committee["Type"]]].append(committee["Name"])

            for action in actions:
                chamber = {"H": "lower", "S": "upper", "E": "other"}[action["code"][0]]  # Effective Date

                try:
                    _types = types[action["code"]]
                except KeyError:
                    self.debug(action)
                    _types = ["other"]

                committees = []
                if any(("committee" in x for x in _types)):
                    committees = [str(x) for x in ccommittees.get(chamber, [])]

                bill.add_action(
                    chamber,
                    action["action"],
                    action["date"],
                    _types,
                    committees=committees,
                    _code=action["code"],
                    _code_id=action["_guid"],
                )

            sponsors = []
            if instrument["Authors"]:
                sponsors = instrument["Authors"]["Sponsorship"]
                if "Sponsors" in instrument and instrument["Sponsors"]:
                    sponsors += instrument["Sponsors"]["Sponsorship"]

            sponsors = [(x["Type"], self.get_member(x["MemberId"])) for x in sponsors]

            for typ, sponsor in sponsors:
                name = "{First} {Last}".format(**dict(sponsor["Name"]))
                bill.add_sponsor("primary" if "Author" in typ else "seconday", name)

            for version in instrument["Versions"]["DocumentDescription"]:
                name, url, doc_id, version_id = [version[x] for x in ["Description", "Url", "Id", "Version"]]
                bill.add_version(
                    name, url, mimetype="application/pdf", _internal_document_id=doc_id, _version_id=version_id
                )

            bill.add_source(self.msource)
            bill.add_source(self.lsource)
            bill.add_source(SOURCE_URL.format(**{"session": session, "bid": guid}))
            self.save_bill(bill)
Esempio n. 38
0
    def scrape(self, chamber, session):
        """
        Scrape every bill of one chamber for a session and save each of them.

        The session's term (looked up in self.metadata['terms']) gives the
        calendar years used to request the bill list.  For each bill link in
        that list the bill info page is read for sponsors, actions, votes and
        subjects, the full-text listing is read for versions, and the
        finished Bill is handed to self.save_bill().

        chamber -- 'upper' or 'lower'; selects which bill prefixes are kept
        session -- session identifier; must be the first session of a term

        Raises NoDataForPeriod when no term starts with `session`, and
        ValueError when `chamber` is neither 'upper' nor 'lower'.
        """
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break
        else:
            raise NoDataForPeriod(session)

        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'
        else:
            # Fail before any network traffic instead of hitting an unbound
            # local further down.
            raise ValueError("unknown chamber: %r" % (chamber,))

        # Full calendar year
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links.  The alternatives must be grouped: without the
        # (?:...) the pattern reads as "bill=SB" OR "SCR" OR "SJR\d+".
        links = bill_list.findAll(
            href=re.compile(r"bill=(?:%s)\d+" % bill_abbr))

        # Patterns shared by every bill; compiled once rather than per bill.
        sponsors_re = re.compile(
            r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})')
        governor_re = re.compile(r' BY REQUEST OF THE GOVERNOR$')
        vote_re = re.compile(r"\w+ Y(\d+) N(\d+)")
        prefile_re = re.compile(r'^Prefile released (\d+/\d+/\d+)$')
        subject_link_re = re.compile(r'.*subject=\w+$')
        text_link_re = re.compile('^get_bill_text?')

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()

            if bill_id.startswith('HB') or bill_id.startswith('SB'):
                btype = ['bill']
            elif bill_id.startswith('SJR') or bill_id.startswith('HJR'):
                btype = ['joint resolution']
            elif bill_id.startswith('SR') or bill_id.startswith('HR'):
                btype = ['resolution']
            elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
                btype = ['concurrent resolution']

            if re.match(r'CONST\.? AM:', bill_name):
                btype.append('constitutional amendment')

            bill = Bill(session, chamber, bill_id, bill_name, type=btype)

            # Get the bill info page
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = sponsors_re.match(spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                sponsor = sponsors[0].strip()

                if sponsor:
                    # Store the stripped name, consistent with cosponsors.
                    bill.add_sponsor('primary', sponsor)

                for sponsor in sponsors[1:]:
                    sponsor = sponsor.strip()
                    if sponsor:
                        bill.add_sponsor('cosponsor', sponsor)
            else:
                # Committee sponsorship
                spons_str = spons_str.strip()

                # search(), not match(): the suffix sits at the end of the
                # string and the stripped string never starts with a space.
                if governor_re.search(spons_str):
                    spons_str = governor_re.sub('', spons_str).title()
                    spons_str = (spons_str +
                                 " Committee (by request of the governor)")

                if spons_str:
                    bill.add_sponsor('committee', spons_str)

            # Get actions
            self._current_comm = None
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if vote_re.match(action):
                    # Vote parsing is best-effort: a failure is logged and
                    # the action itself is still recorded below.
                    try:
                        vote = self.parse_vote(bill, action, act_chamber,
                                               act_date, cols[1].a['href'])
                        bill.add_vote(vote)
                    except Exception:
                        self.log("Failed parsing vote")

                action, atype = self.clean_action(action)

                # A prefile action carries its own date in the text; use it
                # in place of the row date.
                match = prefile_re.match(action)
                if match:
                    action = 'Prefile released'
                    act_date = dt.datetime.strptime(match.group(1), '%m/%d/%y')

                bill.add_action(act_chamber, action, act_date, type=atype)

            # Get subjects
            bill['subjects'] = []
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
Esempio n. 39
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of data)

        For every table row on the sheet that carries a bill id and a title,
        a Bill is built with its sponsors, versions (self.parse_versions),
        actions (self.parse_history + self.categorizer) and votes
        (self.parse_votes), and is then passed to self.save_bill().

        session -- session identifier, forwarded to the URL helpers and Bill
        chamber -- "Senate" or "House" (any other value raises KeyError)

        Raises AssertionError when the bill id found on the vote page does
        not match the bill being scraped.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions inside each row of the sheet table.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # Bill id prefix -> bill type; no key is a prefix of another, so at
        # most one entry can match a given id.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        sheet_html = self.urlopen(sheet_url)
        sheet_page = lxml.html.fromstring(sheet_html)

        bills = sheet_page.xpath('//table/tr')

        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id is None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]
            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            if bill_title is None:
                continue  # Odd ...
            bill_title_and_sponsor = title_and_sponsor.text_content()

            # Whatever follows the title in the cell is the "--"-separated
            # sponsor list.
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            bill_type = None
            for cat in cats:
                if bill_id[:len(cat)] == cat:
                    bill_type = cats[cat]

            b = Bill(session, bill_chamber, bill_id, bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            versions_url = CO_URL_BASE + versions_url
            versions = self.parse_versions(versions_url)

            for version in versions:
                b.add_version(version['name'], version['link'],
                    mimetype=version['mimetype'])

            # We assume the href is a full path relative to CO_URL_BASE;
            # might want to consider some better rel-path support.
            # XXX: Look at this
            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action, date in history:
                # Every action on the sheet is attributed to the sheet's own
                # chamber.
                attrs = dict(actor=bill_chamber, action=action, date=date)
                attrs.update(self.categorizer.categorize(action))
                b.add_action(**attrs)

            for sponsor in sponsors:
                if sponsor and sponsor != "(NONE)":
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes

            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)

            if votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                # Raised explicitly so the check also fires under python -O,
                # where assert statements are stripped.
                raise AssertionError(
                    "vote page sanity check failed for %s" % bill_id)

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                composite_time = "%s %s" % (
                    passage['x-parent-date'],
                    passage['TIME']
                )
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                    "%m/%d/%Y %I:%M:%S %p")
                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                # Normalise each ballot once, so the "other" tally and the
                # per-voter filing below always agree.
                ballots = [(who, filed_votes[who].lower().strip())
                           for who in filed_votes]

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.
                local_other = sum(
                    1 for _, cast in ballots if cast not in ("yes", "no"))

                if local_other != other:
                    self.warning( \
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" % (
                        other, local_other
                    ))
                    other = local_other

                v = Vote(actor, pydate, passage['MOTION'],
                    (result['FINAL_ACTION'] == "PASS"),
                    int(result['YES']), int(result['NO']),
                    other,
                    moved=passage['MOVED'],
                    seconded=passage['SECONDED'])

                v.add_source(passage['url'])
                # v.add_source( bill_vote_href )

                # XXX: Add more stuff to kwargs, we have a ton of data
                for who, cast in ballots:
                    if cast == "yes":
                        v.yes(who)
                    elif cast == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
Esempio n. 40
0
    def scrape_bills(self, chamber_to_scrape, session):
        """
        Scrape and save every measure of one chamber for a session.

        The session's all-measures XML index is fetched; for each measure
        belonging to `chamber_to_scrape` the detail XML is read for titles,
        sponsors, versions, subjects, actions and votes, and the resulting
        Bill is passed to self.save_bill().

        chamber_to_scrape -- 'upper' or 'lower'; measures whose id starts
                             with "S" count as upper, all others as lower
        session -- session identifier; used in the URLs, and its first four
                   characters are used as the year of every action date

        Raises KeyError when a measure of the requested chamber has a type
        letter other than B, C, R or N.
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session
        site_root = "http://billstatus.ls.state.ms.us/"

        # Second character of the measure id -> bill type.
        bill_types = {'B': 'bill', 'C': 'concurrent resolution',
                      'R': 'resolution', 'N': 'nomination'}

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            # just skip past bills that are of the wrong chamber (done
            # before the type lookup so that an unexpected type letter on
            # the other chamber's measure cannot abort this scrape)
            if chamber != chamber_to_scrape:
                continue

            bill_type = bill_types[bill_id[1]]

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
            details_page = self.get(bill_details_url)

            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.  Bytes literals keep this working
            # whether `content` is a Python 2 str or Python 3 bytes.
            page = details_page.content.replace(b"\x0b", b"")

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(session, chamber, bill_id, title,
                        type=bill_type, summary=longtitle)

            # sponsors
            # NOTE(review): sponsor links always point at House_authors,
            # also for Senate measures -- confirm this is intended.
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                bill.add_sponsor("primary", main_sponsor,
                                 main_sponsor_url=main_sponsor_url)
            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                    bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

            # Versions: (XML tag, version name, whether the path must contain
            # "documents").  The first two are added whenever the path is
            # non-empty; the rest only when it contains "documents".
            version_specs = (
                ('CURRENT_OTHER', "Current version", False),
                ('INTRO_OTHER', "As Introduced", False),
                ('CMTESUB_OTHER', "Committee Substitute", True),
                ('PASSED_OTHER', "As Passed the " + chamber, True),
                ('ASG_OTHER', "Approved by the Governor", True),
            )
            for tag, version_name, needs_documents in version_specs:
                version_path = details_root.xpath(
                    'string(//%s)' % tag).replace("../../../../", "")
                if needs_documents:
                    available = "documents" in version_path
                else:
                    available = version_path != ""
                if available:
                    bill.add_version(version_name, site_root + version_path,
                                     on_duplicate='use_new',
                                     mimetype='text/html')

            # use committee names as scraped subjects (independent of the
            # actions, so set once here rather than once per action)
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')
            bill['subjects'] = subjects

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action_node in details_root.xpath('//HISTORY/ACTION'):
                action_num = action_node.xpath('string(ACT_NUMBER)').strip()
                action_num = int(action_num)
                act_vote = action_node.xpath('string(ACT_VOTE)').replace("../../../..", "")
                action_desc = action_node.xpath('string(ACT_DESC)')
                # The description starts with a month/day date; the year is
                # taken from the session identifier.
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if action.find("Veto") != -1:
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = site_root + version_path
                    bill.add_document("Veto", version_url)

                # First matching prefix in self._action_types wins.
                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(actor, action, date, type=atype,
                                action_num=action_num)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        vote = self.scrape_votes(vote_url, action,
                                                 date, actor)
                        vote.add_source(vote_url)
                        bill.add_vote(vote)

            bill.add_source(bill_details_url)
            self.save_bill(bill)
Esempio n. 41
0
    def scrape_bill(self, link, chamber, session):
        legislation_types = {
            'House Bill': 'HB',
            'House Concurrent Resolution': 'HCR',
            'House Joint Resolution': 'HJR',
            'House Resolution': 'HR',
            'Senate Bill': 'SB',
            'Senate Concurrent Resolution': 'SCR',
            'Senate Joint Resolution': 'SJR',
            'Senate Resolution': 'SR',
        }

        base_url = "http://legis.delaware.gov"
        text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"
        try:
            page = self.lxmlize(link, True)
        except requests.exceptions.HTTPError:
            self.logger.warning('404. Apparently the bill hasn\'t been posted')
            return
        nominee = self.get_node(page, './/div[@id="page_header"]/text()')
        if nominee is not None and nominee.strip().lower(
        ) == "nominee information":
            self.logger.info("Nominee, skipping")
            return

        bill_id = self.get_node(
            page, './/div[@align="center" or @style="text-align:center"]')
        try:
            bill_id = bill_id.text_content().strip()
        except IndexError:
            self.logger.warning("Can't find bill number, skipping")
            return

        #some bill_ids include relevant amendments
        #in the form "SB 10 w/SA1", so we fix it here
        bill_id = bill_id.split("w/")[0]
        bill_id = bill_id.split("(")[0]

        leg_type = None
        for long_name, short_name in legislation_types.items():
            if long_name in bill_id:
                leg_type = short_name
                bill_num = bill_id.replace(long_name, "").strip()
                break
        if leg_type:
            bill_id = leg_type + " " + bill_num
        elif "for" in bill_id:
            bill_id = bill_id.split("for")[1]
        else:
            self.logger.warning("Unknown bill type for {}".format(bill_id))
            return

        bill_id = bill_id.replace('&nbsp', "")
        bill_id = bill_id.strip()

        #each row is in its own table
        #there are no classes/ids or anything, so we're going to loop
        #through the individual tables and look for keywords
        #in the first td to tell us what we're looking at
        tables = self.get_nodes(page, './/div[@id="page_content"]/table')

        bill_title = None
        primary_sponsors = []
        cosponsors = []
        bill_url = None
        bill_documents = {}
        action_list = []
        vote_documents = {}
        sub_link = None
        bill_text_avail = False

        if tables is None or not tables:
            self.logger.warning('First xpath didn\'t work.')
            tables = self.get_nodes(page,
                                    './/table[@style="width:837.0px"]/tr')

        for table in tables:
            tds = table.xpath('.//td')
            if len(tds) == 0:
                #some kind of empty table for formatting reasons
                continue
            title_text = tds[0].text_content().strip().lower()

            if title_text.startswith('primary sponsor'):
                pri_sponsor_text = tds[1].text_content()
                primary_sponsors = self.separate_names(pri_sponsor_text)
                #sometimes additional sponsors are in a 3rd td
                #other times the 3rd td contains a blank image
                addl_sponsors = []
                add_spons_text = tds[2].text_content().strip()
                if add_spons_text:
                    add_spons_text = add_spons_text.replace(
                        "Additional Sponsor(s):", "")
                    if not "on behalf of all representatives" in add_spons_text.lower(
                    ):
                        addl_sponsors = self.separate_names(add_spons_text)

            elif title_text.startswith('co-sponsor'):
                cosponsor_text = tds[1].text_content()
                if "none..." in cosponsor_text.lower():
                    cosponsors = []
                    continue
                cosponsors = self.separate_names(cosponsor_text)

            elif title_text.startswith('long title'):
                bill_title = tds[1].text_content().strip()

            elif title_text.startswith('amendment'):
                amendments = tds[1].xpath('.//a')
                for a in amendments:
                    amm = a.text
                    amm_text = "Amendment".format(amm.strip())
                    amm_slg = "+".join(amm.split())
                    amm_link = text_base_url.format(session=session,
                                                    bill_id=amm_slg)
                    bill_documents[amm_text] = amm_link
                    amm_page = self.lxmlize(a.attrib["href"])
                    for tr in amm_page.xpath('//tr'):
                        tds = tr.xpath("./td")
                        if len(tds) > 1:
                            if "voting" in tds[0].text_content().lower():
                                self.find_vote(tds, vote_documents,
                                               "Amendment: ")

            elif title_text.startswith('engrossed version'):
                if tds[1].text_content().strip():
                    engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                    engrossment_link = engrossment_base.format(
                        session=session, bill_id="+".join(bill_id.split()))
                    if bill_url not in bill_documents.values():
                        bill_documents["Engrossed Version"] = engrossment_link

            elif title_text.startswith('substituted'):
                content = tds[1].text_content().strip()
                if ("Substitute" in content and not "Original" in content):
                    sub_link = tds[1].xpath(".//a/@href")[0]

            elif ("full text" in title_text
                  and ("(" not in title_text or "html" in title_text)):
                if tds[1].text_content().strip():
                    #it is totally unclear which version of the bill is referred to here
                    #so I'm just calling it "bill text"
                    bill_url = text_base_url.format(session=session,
                                                    bill_id=bill_id.replace(
                                                        " ", "+"))
                    if bill_url not in bill_documents.values():
                        bill_documents["Bill Text"] = bill_url

            elif title_text.startswith('fiscal notes'):
                pass
                #skipping fiscal notes for now, they are really ugly
                #but leaving in as a placeholder so we can remember to
                #do this someday, if we feel like it

            elif title_text.startswith('committee reports'):
                pass
                #the committee reports let a legislator
                #comment on a bill. They can comment as
                #"favorable","unfavorable" or "on its merits"
                #but these are NOT votes (per conversation w
                #seceretary of the DE senate 3/16/15). The bill is
                #considered if the majority sign it, which will
                #appear in the bill's action history as being
                #reported out of committee

            elif title_text.startswith('voting'):
                self.find_vote(tds, vote_documents)
            elif title_text.startswith('actions history'):
                action_list = tds[1].text_content().split("\n")

        sub_versions = []
        use_sub = False
        if sub_link:
            bill = self.scrape_bill(sub_link, chamber, session)
            if bill:
                sub_versions = [v["url"] for v in bill["versions"]]
                bill.add_title(bill_id)
                use_sub = True

        if not use_sub:
            bill = Bill(session, chamber, bill_id, bill_title)

            for s in primary_sponsors:
                bill.add_sponsor("primary", s)

            for s in addl_sponsors:
                #it is not totally clear whether "additional sponsors"
                #are co or primary but primary is my best guess
                #based on the bill text, bc they're on the first
                #line with the primary sponsor
                bill.add_sponsor("primary", s)

            for s in cosponsors:
                bill.add_sponsor("cosponsor", s)

        for name, doc_link in bill_documents.items():
            if "Engrossment" in name or "Bill Text" in name:
                if doc_link not in sub_versions:
                    bill.add_version(name, doc_link, mimetype="text/html")
            else:
                pass
                bill.add_document(name, doc_link, mimetype="text/html")

        for a in action_list:
            if a.strip():
                date, action = a.split('-', 1)
                try:
                    date = datetime.strptime(date.strip(), '%b %d, %Y')
                except ValueError:
                    date = datetime.strptime(date.strip(),
                                             '%B %d, %Y')  # XXX: ugh.
                action = action.strip()
                actor = actions.get_actor(action, bill['chamber'])
                attrs = dict(actor=actor, action=action, date=date)
                attrs.update(**self.categorizer.categorize(action))
                attrs["action"] = " ".join(attrs["action"].split())
                bill.add_action(**attrs)

        for name, doc in vote_documents.items():
            vote_chamber = "lower" if "house" in name.lower() else "upper"
            try:
                self.head(doc)
            except requests.exceptions.HTTPError:
                self.logger.warning("could not access vote document")
                continue
            vote_page = self.lxmlize(doc)
            vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1]
            yes_votes = []
            no_votes = []
            other_votes = []
            lines = vote_info.text_content().split("\n")
            for line in lines:
                if line.strip().startswith("Date"):
                    date_str = " ".join(line.split()[1:4])
                    date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                    passage_status = line.strip().split()[-1]

                    #we've never seen a vote with anything but "passed"
                    #so throw an error otherwise so we can figure it out
                    passed_statuses = ["Passed"]
                    failed_statuses = ["Defeated", "Rescinded"]
                    if passage_status not in passed_statuses + failed_statuses:
                        raise AssertionError(
                            "Unknown passage state {}".format(passage_status))
                    passed = passage_status in passed_statuses

                if line.strip().startswith("Vote Type"):
                    if "voice" in line.lower():
                        voice_vote = True
                    else:
                        voice_vote = False
                        yes_count = int(re.findall("Yes: (\d+)", line)[0])
                        no_count = int(re.findall("No: (\d+)", line)[0])
                        other_count = int(
                            re.findall("Not Voting: (\d+)", line)[0])
                        other_count += int(
                            re.findall("Absent: (\d+)", line)[0])
                        vote_tds = vote_page.xpath(".//table//td")
                        person_seen = False
                        for td in vote_tds:
                            if person_seen:
                                person_vote = td.text_content().strip()
                                if person_vote == "Y":
                                    yes_votes.append(person)
                                elif person_vote == "N":
                                    no_votes.append(person)
                                elif person_vote in ["NV", "A", "X", "C"]:
                                    other_votes.append(person)
                                else:
                                    raise AssertionError(
                                        "Unknown vote '{}'".format(
                                            person_vote))
                                person_seen = False
                            else:
                                person = td.text_content().strip()
                                if person:
                                    person_seen = True

            if voice_vote:
                vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0)
            else:
                vote = Vote(vote_chamber,
                            date,
                            "passage",
                            passed,
                            yes_count,
                            no_count,
                            other_count,
                            yes_votes=[],
                            no_votes=[],
                            other_votes=[])

                vote["yes_votes"] = yes_votes
                vote["no_votes"] = no_votes
                vote["other_votes"] = other_votes

            if (passed and vote["yes_count"] <= vote["no_count"]
                    and not voice_vote):
                raise AssertionError("Vote passed with more N than Y votes?")

            if not passed and vote["yes_count"] > vote["no_count"]:
                self.logger.warning("Vote did not pass but had a majority \
                        probably worth checking")

            if "Amendment" in name:
                vote["type"] = "amendment"
            else:
                vote["type"] = "passage"
            vote.add_source(doc)
            bill.add_vote(vote)

        bill.add_source(link)

        return bill
Esempio n. 42
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Scrape and save every bill of one measure type for a session.

        Rows are read through ``self.session.query(CABill)`` and each one is
        turned into a ``Bill`` (versions, sponsors, actions, votes) that is
        handed to ``self.save_bill``.  Bills for which no version yields a
        title are skipped with a warning.

        :param chamber: 'upper' selects the Senate; any other value is
            treated as the Assembly.  Also used as the bill's chamber and as
            the fallback actor for actions that have none.
        :param session: session identifier such as '20092010'.
        :param bill_type: type label stored on the bill and on its versions.
        :param type_abbr: value matched against ``CABill.measure_type``.
        :raises ScrapeError: if a vote's location does not start with a
            recognised chamber name.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct session for web query, going from '20092010' to '0910'
            source_session = session[2:4] + session[6:8]

            # Turn 'AB 10' into 'ab_10'
            source_num = "%s_%s" % (bill.measure_type.lower(),
                                    bill.measure_num)

            # Construct a fake source url
            source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                          "bill_number=%s&sess=%s" %
                          (source_num, source_session))

            fsbill.add_source(source_url)

            # Indexed below as (date, url) entries.
            scraped_versions = self.scrape_site_versions(source_url)

            title = ''
            short_title = ''
            bill_types = ['bill']
            subject = ''
            all_titles = set()
            # Position of the next scraped entry that has not been matched
            # to a database version yet.
            i = 0
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                title = clean_title(version.title)
                if title:
                    all_titles.add(title)
                short_title = clean_title(version.short_title)
                bill_types = [bill_type]

                if version.appropriation == 'Yes':
                    bill_types.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    bill_types.append('fiscal committee')
                if version.local_program == 'Yes':
                    bill_types.append('local program')
                if version.urgency == 'Yes':
                    bill_types.append('urgency')
                if version.taxlevy == 'Yes':
                    bill_types.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

                date = version.bill_version_action_date.date()

                # Only consume a scraped entry when its date lines up with
                # this version; otherwise the version gets an empty URL.
                url = ''
                try:
                    scraped_version = scraped_versions[i]
                    if scraped_version[0] == date:
                        url = scraped_version[1]
                        i += 1
                except IndexError:
                    pass

                fsbill.add_version(
                    version.bill_version_id, url,
                    date=date,
                    title=title,
                    short_title=short_title,
                    subject=[subject],
                    type=bill_types)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = bill_types
            fsbill['subjects'] = [subject]

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # `version` is whatever the loop above visited last, i.e. the
            # final entry of bill.versions.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            introduced = False

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = (action.actor or chamber).strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                action_types = []

                # Runs of whitespace are collapsed here, so every substring
                # test below must use single spaces.
                act_str = re.sub(r'\s+', ' ', action.action)

                if act_str.startswith('Introduced'):
                    introduced = True
                    action_types.append('bill:introduced')

                if 'Read first time.' in act_str:
                    if not introduced:
                        action_types.append('bill:introduced')
                        introduced = True
                    action_types.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    action_types.append('committee:referred')

                # The original test used two spaces after the period and so
                # could never match the whitespace-normalised text.
                if 'Read third time. Passed.' in act_str:
                    action_types.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    action_types.append('governor:signed')

                if 'Item veto' in act_str:
                    action_types.append('governor:vetoed:line-item')

                if 'Vetoed by Governor' in act_str:
                    action_types.append('governor:vetoed')

                if 'To Governor' in act_str:
                    action_types.append('governor:received')

                if 'Read second time' in act_str:
                    action_types.append('bill:reading:2')

                if not action_types:
                    action_types = ['other']

                fsbill.add_action(actor, act_str, action.action_date.date(),
                                  type=action_types)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                full_loc = vote.location.description
                loc_words = full_loc.split(' ')
                first_part = loc_words[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)
                vote_location = ' '.join(loc_words[1:])

                # A vote may have no motion record at all; treat that like
                # blank motion text so it is skipped below instead of
                # raising AttributeError.
                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session / chamber / bill-number boilerplate from the
                # motion text.  These patterns run before whitespace is
                # collapsed, so the double spaces in them are intentional.
                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # NOTE(review): unlike its neighbours this pattern omits
                # SJR -- confirm whether that is deliberate.
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                # The abstain count field in CA's database includes
                # vacancies, which we aren't interested in.
                fsvote['other_count'] = len(fsvote['other_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Esempio n. 43
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
            committee_abbr_regex=get_committee_name_regex()):
        """Scrape and save every bill of one measure type for a session.

        Rows are read through ``self.session.query(CABill)`` and each one is
        turned into a ``Bill`` (summary, sponsors, categorized actions,
        votes) that is handed to ``self.save_bill``.  Bills for which no
        version yields a title are skipped with a warning.

        :param chamber: 'upper' selects the Senate; any other value is
            treated as the Assembly.  Also used as the bill's chamber and as
            the fallback actor for actions that have none.
        :param session: session identifier, matched against
            ``CABill.session_year``.
        :param bill_type: type label stored on the bill.
        :param type_abbr: value matched against ``CABill.measure_type``.
        :param committee_abbr_regex: compiled pattern whose ``findall`` is
            run on each action's text to pull out committee abbreviations.
            The default is built once, when the method is defined.
        :raises ScrapeError: if a vote's location does not start with a
            recognised chamber name.
        :raises KeyError: if an extracted committee abbreviation has no
            known committee name.
        :raises AssertionError: if a committee abbreviation maps to an empty
            name.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        chamber_codes = {'Assembly': 'lower', 'Senate': 'upper'}
        # Matches e.g. "Senator <name>" or "Assemblyman/-woman/-person <name>".
        legislator_re = re.compile(
            r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+', re.I)

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version(bill_id, source_url, 'text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            summary = ''
            impact_clause = None
            all_titles = set()

            # Get digest text (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                chunks = []
                for el in version.xml.xpath(xpath, namespaces=nsmap):
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    # Re-insert the space after a closing parenthesis when
                    # it is directly followed by a non-space character.
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    type_.append('fiscal committee')
                if version.local_program == 'Yes':
                    type_.append('local program')
                if version.urgency == 'Yes':
                    type_.append('urgency')
                if version.taxlevy == 'Yes':
                    type_.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['summary'] = summary
            fsbill['type'] = type_
            # Store a real list (not a lazy filter object) so the value is
            # the same on every Python version.
            fsbill['subjects'] = [subject] if subject else []
            fsbill['impact_clause'] = impact_clause

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # `version` is whatever the loop above visited last, i.e. the
            # final entry of bill.versions.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                       author.name,
                                       official_type=author.contribution)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = (action.actor or chamber).strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = chamber_codes[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'other'
                else:
                    # Swap a leading chamber name for its code but keep the
                    # rest of the string for the committee-code search below.
                    actor = re.sub(r'^(Assembly|Senate)',
                                   lambda m: chamber_codes[m.group()],
                                   actor)

                act_str = re.sub(r'\s+', ' ', action.action)

                # The categorizer's result doubles as the keyword arguments
                # for add_action; extra keys are added to it below.
                kwargs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees,
                # if any.
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if 'Com. on' in action.action and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            raise KeyError(msg % (abbr, action.action))
                        else:
                            committees.append(name)

                    committees = [name for name in committees if name]
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        kwargs['actor_info'] = {
                            'committee_code': code.group()}

                    # Every abbreviation must have produced a non-empty
                    # name, otherwise the pairing below would be misaligned.
                    if len(committees) != len(matched_abbrs):
                        raise AssertionError(
                            'Empty committee name for an abbreviation in %r.'
                            % (action.action,))
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)

                # Reduce the actor to a bare chamber code; everything else
                # becomes 'other'.
                for prefix in ('upper', 'lower', 'joint'):
                    if actor.startswith(prefix):
                        actor = prefix
                        break
                else:
                    actor = 'other'

                # Keep the raw actor string around when it was rewritten.
                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                legislators = legislator_re.findall(action.action)
                if legislators:
                    kwargs['legislators'] = legislators

                fsbill.add_action(actor, act_str, action.action_date.date(),
                                  **kwargs)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                full_loc = vote.location.description
                loc_words = full_loc.split(' ')
                first_part = loc_words[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)
                vote_location = ' '.join(loc_words[1:])

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session / chamber / bill-number boilerplate from the
                # motion text.  These patterns run before whitespace is
                # collapsed, so the double spaces in them are intentional.
                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # NOTE(review): unlike its neighbours this pattern omits
                # SJR -- confirm whether that is deliberate.
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # `type` (not `type_`) is the key votes are read back with
                # elsewhere in this file.
                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                for s in ('yes', 'no', 'other'):
                    # Kill dupe votes.
                    key = s + '_votes'
                    fsvote[key] = list(set(fsvote[key]))

                    # In a small percentage of bills, the integer vote
                    # counts are inaccurate, so let's ignore them.
                    fsvote[s + '_count'] = len(fsvote[key])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Esempio n. 44
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill's detail page and save the resulting ``Bill``.

        The page at ``self.base_url`` + ``?r=<bill_id>`` is parsed for the
        title, authors, co-authors and the action table; rows of the action
        table whose text ends in a dash-separated tally are also recorded as
        votes (counts only).

        :param chamber: chamber the bill belongs to; also the fallback
            chamber for votes whose text does not name one.
        :param session: session identifier passed through to ``Bill``.
        :param bill_id: bill identifier used in the query string.
        :param bill_type: stored as the bill's ``type``.
        :raises NoSuchBill: if the page has no title cell.
        :raises AssertionError: if a vote row's text matches none of the
            known phrasings.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        html = self.get(url).text
        if "error '80020009'" in html:
            self.warning('asp error on page, skipping %s', bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsor('primary', aname)

        co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
        # The names are read from the second text node, so make sure it
        # exists before indexing.
        if len(co_authors) > 1:
            for co_author in co_authors[1].split(','):
                co_name = self.clean_name(co_author).strip()
                # Skip blanks, as is done for primary authors above.
                if co_name:
                    bill.add_sponsor('cosponsor', co_name)

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            # A row with an empty date cell reuses the date of the previous
            # row.  NOTE(review): if the very first row has an empty date
            # cell, `date` is still unbound here -- confirm that cannot
            # happen.
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has a hyphen let's assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)

            # Some lower-house roll calls could be parsed, but finnicky
            # Most roll lists are just images embedded within a document,
            # and offer no alt text to scrape
            # Instead, just scrape the vote counts
            vote_info = re.search(
                r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-'
                r'([\s\d]{2})-([\s\d]{0,2})$',
                action)
            if not (vote_info and re.search(r'\d{1,2}', action)):
                continue

            vote_name = vote_info.group(1)

            if u"Votación Final" in vote_name:
                (vote_chamber, vote_name) = re.search(
                        r'(?u)^\w+ por (.*?) en (.*)$', vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'

            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(
                        r'(?u)^Cuerpo de Origen (.*)$', vote_name).group(1)
                vote_chamber = chamber

            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                        r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                        vote_name).groups()
                if vote_chamber == "Senado":
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'

            elif u"Se reconsideró" in vote_name:
                # A reconsideration is attributed to the chamber of the
                # most recently recorded vote, if there is one.
                if bill['votes']:
                    vote_chamber = bill['votes'][-1]['chamber']
                else:
                    vote_chamber = chamber

            else:
                raise AssertionError(
                        u"Unknown vote text found: {}".format(vote_name))

            vote_name = vote_name.title()

            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            # The last two tally fields may be blank; both are folded into
            # a single "other" count.
            other = 0
            if vote_info.group(4).strip():
                other += int(vote_info.group(4))
            if vote_info.group(5).strip():
                other += int(vote_info.group(5))

            vote = Vote(
                    chamber=vote_chamber,
                    date=date,
                    motion=vote_name,
                    passed=(yes > no),
                    yes_count=yes,
                    no_count=no,
                    other_count=other
                    )
            vote.add_source(url)
            bill.add_vote(vote)

        bill.add_source(url)
        self.save_bill(bill)
Esempio n. 45
0
    def scrape_bill(self, chamber, session, url):
        """Scrape one bill page and save the resulting ``Bill``.

        The title comes from the page's first ``<h3>``; the bill id,
        actions, primary sponsor and co-sponsors are parsed out of the
        ``<pre class="billhistory">`` text; versions and votes are collected
        from links on the page.

        :param chamber: chamber passed through to ``Bill``.
        :param session: session identifier; also appended to ``url`` as the
            ``Year`` query parameter.
        :param url: bill page URL without the ``Year`` parameter.
        """
        url = url + "&Year=%s" % session
        with self.urlopen(url) as page:
            page = page.replace('&nbsp;', ' ').replace('<br>', '\n')
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath('//h3')[0].text.strip()
            # Drop the leading "<word> <number>: " prefix from the heading.
            title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

            bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
            bill_id = bill_id.split()[0].strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            hist = page.xpath("string(//pre[@class='billhistory'])")
            # One action: an indented mm/dd/yy date, the acting chamber (or
            # a blank), the action text, then any continuation lines that
            # are indented by 16 whitespace characters.
            act_re = re.compile(r'^\s+(\d\d/\d\d/\d\d) (SENATE|HOUSE|\s)'
                                r'([^\n]+\n?(\s{16,16}.*\n){0,})',
                                re.MULTILINE)
            # Separator such as " -HJ 123; " between the sub-actions of a
            # single history entry.
            journal_ref_re = re.compile(r' -[HS]J \d+;? ?')

            # Actions
            for match in act_re.finditer(hist):
                action = match.group(3).replace('\n', ' ')
                action = re.sub(r'\s+', ' ', action).strip()

                actor = match.group(2)
                if actor == 'SENATE':
                    actor = 'upper'
                elif actor == 'HOUSE':
                    actor = 'lower'
                else:
                    actor = 'executive'

                date = datetime.datetime.strptime(match.group(1), "%m/%d/%y")

                for act_text in journal_ref_re.split(action):
                    act_text = act_text.strip()
                    if not act_text:
                        continue

                    types = []
                    act_lower = act_text.lower()
                    if act_lower.startswith('introduced'):
                        types.append('bill:introduced')
                    if 'referred to' in act_lower:
                        types.append('committee:referred')
                    if 'died in committee' in act_lower:
                        types.append('committee:failed')
                    if 'favorable by' in act_lower:
                        types.append('committee:passed:favorable')
                    if 'amendment(s) adopted' in act_lower:
                        types.append('amendment:passed')

                    bill.add_action(actor, act_text, date, type=types)

            # Sponsors
            primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                        hist).group(1).strip('; ')
            bill.add_sponsor('primary', primary_sponsor)

            cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                                   r'([\w .]+(;[\w .\n]+){0,})',
                                   re.MULTILINE)
            match = cospon_re.search(hist)

            if match:
                for cosponsor in match.group(2).split(';'):
                    cosponsor = cosponsor.replace('\n', '').strip()
                    # A segment made only of whitespace would otherwise be
                    # recorded as a nameless sponsor.
                    if cosponsor:
                        bill.add_sponsor('cosponsor', cosponsor)

            # Versions
            for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
                version = link.xpath('string(../../td[1])').strip()

                bill.add_version(version, link.attrib['href'])

            # House Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
                bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

            # Senate Votes
            for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
                bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

            self.save_bill(bill)
Esempio n. 46
0
    def scrape(self, chamber, session):
        """Scrape every bill of one chamber for a session from Alaska's BASIS.

        The term whose first session equals ``session`` supplies the start
        and end years used to request the full bill list.  For each bill link
        found, the bill's info page is parsed for sponsors, actions (and any
        votes recorded in them) and subjects, the full-text page is parsed
        for versions, and the bill is saved with ``self.save_bill``.

        :param chamber: ``'upper'`` or ``'lower'``; selects which bill
            prefixes are looked for.
        :param session: session identifier used in the BASIS URLs.
        :raises NoDataForPeriod: if no term in ``self.metadata['terms']``
            has ``session`` as its first session.
        """
        for term in self.metadata['terms']:
            if term['sessions'][0] == session:
                year = str(term['start_year'])
                year2 = str(term['end_year'])
                break
        else:
            raise NoDataForPeriod(session)

        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Full calendar year (two-digit years, MMDDYY)
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        # NOTE(review): the alternation in bill_abbr is not grouped, so this
        # pattern matches "bill=SB", "SCR" or "SJR<digits>" anywhere in the
        # href.  Left as-is because the real href format is not visible
        # here -- confirm before tightening to "bill=(?:...)\d+".
        re_str = r"bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        governor_re = r' BY REQUEST OF THE GOVERNOR$'

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()

            if bill_id.startswith('HB') or bill_id.startswith('SB'):
                btype = ['bill']
            elif bill_id.startswith('SJR') or bill_id.startswith('HJR'):
                btype = ['joint resolution']
            elif bill_id.startswith('SR') or bill_id.startswith('HR'):
                btype = ['resolution']
            elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
                btype = ['concurrent resolution']

            if re.match(r'CONST\.? AM:', bill_name):
                btype.append('constitutional amendment')

            bill = Bill(session, chamber, bill_id, bill_name, type=btype)

            # Get the bill info page
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                # First name listed is the primary sponsor, the rest are
                # cosponsors.  Strip every name so none is stored with the
                # whitespace that follows the separating commas.
                sponsors = [name.strip()
                            for name in sponsors_match.group(2).split(',')]

                if sponsors[0]:
                    bill.add_sponsor('primary', sponsors[0])

                for sponsor in sponsors[1:]:
                    if sponsor:
                        bill.add_sponsor('cosponsor', sponsor)
            else:
                # Committee sponsorship
                spons_str = spons_str.strip()

                # re.search, not re.match: the suffix sits at the END of the
                # (already stripped) string, so an anchored-at-start match
                # could never succeed.
                if re.search(governor_re, spons_str):
                    spons_str = re.sub(governor_re, '', spons_str).title()
                    spons_str = (spons_str +
                                 " Committee (by request of the governor)")

                if spons_str:
                    bill.add_sponsor('committee', spons_str)

            # Get actions
            self._current_comm = None
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                # Third column carries "(H)" / "(S)"; anything else falls
                # back to the chamber being scraped.
                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                if re.match(r"\w+ Y(\d+) N(\d+)", action):
                    # Vote parsing is best-effort: a failure is logged and
                    # the action itself is still recorded below.
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except Exception:
                        self.log("Failed parsing vote")

                action, atype = self.clean_action(action)

                # "Prefile released <date>" carries its own date, which
                # replaces the one from the row.
                match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
                if match:
                    action = 'Prefile released'
                    act_date = dt.datetime.strptime(match.group(1),
                                                    '%m/%d/%y')

                bill.add_action(act_chamber, action, act_date, type=atype)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile(r'.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile('^get_bill_text?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
Esempio n. 47
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of
        data) for one chamber and save every bill found on it.

        For each table row with a bill id this builds a ``Bill``, attaches
        its versions, history actions, sponsors and votes, and saves it.

        :param session: session identifier, passed to ``get_bill_folder``,
            ``get_vote_url`` and ``Bill``.
        :param chamber: ``"Senate"`` or ``"House"``; any other value raises
            ``KeyError``.
        :raises AssertionError: if the bill id reported by the vote page does
            not match the bill id read from the sheet.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions inside each row of the bill sheet table.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # Bill id prefix -> bill type (loop-invariant, so built once).
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id is None:
                    # Every other entry is null for some reason
                    continue

                dot_loc = bill_id.find('.')
                if dot_loc != -1:
                    # budget bills are missing the .pdf, don't truncate
                    bill_id = bill_id[:dot_loc]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                # The element's own text is the title; whatever remains of
                # the full text content is the "--"-separated sponsor list.
                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                bill_type = None
                for cat in cats:
                    if bill_id.startswith(cat):
                        bill_type = cats[cat]

                b = Bill(session,
                         bill_chamber,
                         bill_id,
                         bill_title,
                         type=bill_type)

                b.add_source(sheet_url)

                versions_url = \
                    bill[index["version"]].xpath('font/a')[0].attrib["href"]
                versions_url = CO_URL_BASE + versions_url
                versions = self.parse_versions(versions_url)
                for version in versions:
                    b.add_version(version['name'],
                                  version['link'],
                                  mimetype=version['mimetype'])

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b.add_source(bill_history_href)

                for action in history:
                    self.add_action_to_bill(b, action)

                for sponsor in sponsors:
                    if sponsor and sponsor != "(NONE)":
                        b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                if votes['sanity-check'] != bill_id:
                    self.warning("XXX: READ ME! Sanity check failed!")
                    self.warning(" -> Scraped ID: " + votes['sanity-check'])
                    self.warning(" -> 'Real' ID:  " + bill_id)
                    # Raised explicitly (rather than via ``assert``) so the
                    # check is not stripped when running under ``python -O``.
                    raise AssertionError(
                        "vote page sanity check failed for %s" % bill_id)

                for vote in votes['votes']:
                    filed_votes = vote['votes']
                    passage = vote['meta']
                    result = vote['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    hasHouse = "House" in passage['x-parent-ctty']
                    hasSenate = "Senate" in passage['x-parent-ctty']

                    if hasHouse and hasSenate:
                        actor = "joint"
                    elif hasHouse:
                        actor = "lower"
                    else:
                        actor = "upper"

                    # Normalise each cast vote ONCE so the tally below and
                    # the per-voter classification further down can never
                    # disagree about what counts as "yes" / "no".
                    cast_votes = [(who, cast.lower().strip())
                                  for who, cast in filed_votes.items()]

                    other = (int(result['EXC']) + int(result['ABS']))
                    # OK, sometimes the Other count is wrong.
                    local_other = 0
                    for _who, cast in cast_votes:
                        if cast not in ("yes", "no"):
                            local_other += 1

                    if local_other != other:
                        self.warning(
                            "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                        self.warning(" -> Old: %s // New: %s" %
                                     (other, local_other))
                        other = local_other

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "PASS"),
                             int(result['YES']),
                             int(result['NO']),
                             other,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])

                    v.add_source(vote['meta']['url'])
                    # v.add_source( bill_vote_href )

                    # XXX: Add more stuff to kwargs, we have a ton of data
                    for who, cast in cast_votes:
                        if cast == "yes":
                            v.yes(who)
                        elif cast == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    b.add_vote(v)
                self.save_bill(b)
Esempio n. 48
0
    def scrape_xml(self, chamber, session):
        """Scrape one chamber's bills for a session from the BillSummary XML.

        Every ``<Bill>`` element whose ``Type`` starts with the chamber's
        letter becomes a ``Bill`` with its sponsors, HTML versions, status
        history actions and any votes linked from the bill's summary page
        (looked up in ``self.votes``), and is then saved.

        :param chamber: ``'upper'`` selects types starting with ``S``; any
            other value selects types starting with ``H``.
        :param session: session identifier interpolated into the URLs.
        :raises ValueError: for a bill type other than HB/SB/HR/SR.
        """
        start_letter = 'S' if chamber == 'upper' else 'H'
        sponsor_type_dict = {'3': 'cosponsor', '4': 'primary', '5': 'primary'}
        version_url = 'http://www1.legis.ga.gov/legis/%s/versions/' % session

        summary_url = (
            'http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml' % session)
        xml = self.urlopen(summary_url).bytes
        doc = lxml.etree.fromstring(xml)

        for bxml in doc.xpath('//Bill'):
            type_code = bxml.get('Type')

            # if this is from the other chamber skip it
            if not type_code.startswith(start_letter):
                continue

            bill_id = type_code + bxml.get('Num') + bxml.get('Suffix')
            if type_code in ('HB', 'SB'):
                bill_type = 'bill'
            elif type_code in ('HR', 'SR'):
                bill_type = 'resolution'
            else:
                raise ValueError('unknown type: %s' % type_code)

            # use short_title as title and long as summary
            title = bxml.xpath('Short_Title/text()')
            summary = bxml.xpath('Title/text()')

            summary = summary[0] if summary else ''

            if title:
                title = title[0]
            else:
                # No short title: promote the long title to be the title.
                title = summary
                summary = ''

            if not title and not summary:
                self.warning('no title or summary for %s, skipping' % bill_id)
                continue

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        type=bill_type,
                        summary=summary)
            bill_url = 'http://www1.legis.ga.gov/legis/%s/sum/%s.htm' % (
                session, bill_id.lower())
            bill.add_source(bill_url)
            bill.add_source(summary_url)

            # get votes from ids
            bhtml = lxml.html.fromstring(self.urlopen(bill_url))
            vote_links = bhtml.xpath('//a[contains(@href, "/votes/")]/@href')
            # vote id == file name of the link without its extension
            vote_ids = [l.rsplit('/')[-1].split('.')[0] for l in vote_links]
            for vid in vote_ids:
                bill.add_vote(self.votes[vid])

            for sponsor in bxml.xpath('Sponsor'):
                # Sponsor text ends with a code separated by the last space.
                sponsor_name, code = sponsor.text.rsplit(' ', 1)
                sponsor_name = sponsor_name.replace(',', ', ')
                bill.add_sponsor(sponsor_type_dict[sponsor.get('Type')],
                                 sponsor_name,
                                 _code=code)

            for version in bxml.xpath('Versions/Version'):
                # NOTE: it is possible to get PDF versions by using .get('Id')
                # ex. URL:  legis.ga.gov/Legislation/20112012/108025.pdf
                # for now we just get HTML
                description, file_id = version.xpath('*/text()')
                bill.add_version(description,
                                 version_url + file_id,
                                 mimetype='text/html')

            for action in bxml.xpath('StatusHistory/Status'):
                date = datetime.datetime.strptime(action.get('StatusDate'),
                                                  "%Y-%m-%dT%H:%M:%S")
                code = action.get('StatusCode')
                if code in ('EFF', 'Signed Gov'):
                    actor = 'executive'
                elif code[0] == 'S':
                    actor = 'upper'
                elif code[0] == 'H':
                    actor = 'lower'
                else:
                    # Unrecognised prefix: attribute the action to the
                    # chamber being scraped instead of silently reusing the
                    # previous action's actor (or hitting a NameError).
                    actor = chamber

                try:
                    atype = self._action_codes[code]
                except KeyError:
                    self.warning("unknown action code %s on %s" %
                                 (code, action.text))
                    # Without this the previous iteration's type would be
                    # reused (or a NameError raised on the first action).
                    # 'other' is assumed to be the generic action type --
                    # TODO confirm against the action type list in use.
                    atype = 'other'

                bill.add_action(actor, action.text, date, atype)

            self.save_bill(bill)
Esempio n. 49
0
    def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
        """Scrape a bill from 2008 or earlier.

        Bills from 2008 and below are in a 'pre' element and it is simpler
        to parse them as text, line by line.  Lines starting with ``MM/DD``
        are actions; the lines between a passed/failed action and the next
        "Floor Sponsor" line hold that vote's roll call.

        Parsing state is kept on the instance: ``self.vote``,
        ``self.in_vote``, ``self.last_date`` and the ``self.ayes`` /
        ``self.nays`` / ``self.other`` flags, which are read here after
        calling ``self.flag``.

        :param chamber: chamber the bill belongs to; also the initial actor.
        :param session: session identifier; its first four characters are
            used as the year of every action date.
        :param bill_id: bill identifier, e.g. with a space between prefix
            and number (spaces are removed to build the URL).
        :param short_title: accepted for interface compatibility; not used.
        """
        url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
            session, bill_id.replace(' ', ''))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        text = html.xpath('//pre')[0].text.split('\r\n')

        # title: the upper-case, dash-separated parts of the second line
        title = " - ".join(
            [x.strip() for x in text[1].split('-') if x.isupper()])
        # bill type
        bill_type = get_bill_type(bill_id)

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        # sponsors: everything after the last "by" on the first line
        sponsors = text[0].split('by')[-1]
        for sponsor in sponsors.split(','):
            # Strip so names are not stored with the whitespace around the
            # commas, and skip empty fragments.
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsor('primary', sponsor)

        actor = chamber
        self.flag()  # clear last bills vote flags
        self.vote = None  #

        for line in text:

            if re.match(r'^\d\d/\d\d', line):
                date = datetime.datetime.strptime(
                    line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                self.last_date = date
                action_text = line[5:].strip()
                # actor
                if action_text.lower().startswith('house') or \
                   action_text.lower().startswith('senate'):
                    # The check above is case-insensitive, so normalise the
                    # first letter before the lookup.
                    actor = {'H': 'lower',
                             'S': 'upper'}[action_text[0].upper()]

                action = get_action(actor, action_text)
                bill.add_action(actor, action_text, date, type=action)
                if "bill:passed" in action or "bill:failed" in action:
                    passed = 'FAILED' not in action_text
                    votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                    if votes:
                        yes, no, other = votes.groups()
                        self.in_vote = True
                        # Record the vote under the chamber that acted
                        # (``actor``), which can differ from the bill's
                        # chamber of origin.
                        self.vote = Vote(actor, date, action_text, passed,
                                         int(yes), int(no), int(other))
            else:
                # nothing to do if its not a vote
                if "Floor Sponsor" in line:
                    # The floor sponsor line ends the roll call listing.
                    self.in_vote = False
                    if self.vote:
                        bill.add_vote(self.vote)
                        self.vote = None

                if not self.in_vote:
                    continue
                if 'AYES --' in line:
                    self.flag(ayes=True)
                elif 'NAYS --' in line:
                    self.flag(nays=True)
                elif 'Absent and excused' in line:
                    self.flag(other=True)

                # Names follow the section header on the same line and on
                # the lines after it; anything from "(" onward is dropped.
                if self.ayes:
                    for name in line.replace('AYES --', '').split(','):
                        name = name.split('(')[0].strip()
                        if name:
                            self.vote.yes(name)

                if self.nays:
                    for name in line.replace('NAYS --', '').split(','):
                        name = name.split('(')[0].strip()
                        if name:
                            self.vote.no(name)

                if self.other:
                    for name in line.replace(
                            'Absent and excused --', '').split(','):
                        name = name.split('(')[0].strip()
                        if name:
                            self.vote.other(name)

        self.save_bill(bill)
Esempio n. 50
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape a single Michigan bill and save it.

        The bill page is requested for the session's first year and, if that
        page is not found, for its last year.  Sponsors, subjects, actions
        (with roll-call votes where the action mentions one), versions and
        analysis documents are collected before the bill is saved.

        :param chamber: chamber passed through to ``Bill``.
        :param session: session string whose first and last four characters
            are the two years tried in the bill URL.
        :param bill_id: bill identifier such as ``"HB 4001"``.
        :returns: ``True`` once the bill has been saved, ``None`` if neither
            year has a page for the bill.
        """
        url_template = 'http://legislature.mi.gov/doc.aspx?%s-%s'
        bill_slug = bill_id.replace(' ', '-')

        # try and get bill for current year
        url = url_template % (session[:4], bill_slug)
        html = self.urlopen(url)
        # if first page isn't found, try second year
        if 'Page Not Found' in html:
            # Rebind ``url`` so the source recorded below is the page that
            # was actually parsed, not the not-found first-year URL.
            url = url_template % (session[-4:], bill_slug)
            html = self.urlopen(url)
            if 'Page Not Found' in html:
                return None

        doc = lxml.html.fromstring(html)

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(session=session,
                    chamber=chamber,
                    bill_id=bill_id,
                    title=title,
                    type=bill_type)
        bill.add_source(url)

        # sponsors: first one listed is primary, the rest are cosponsors
        sp_type = 'primary'
        for sponsor in doc.xpath(
                '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ')
            bill.add_sponsor(sp_type, sponsor)
            sp_type = 'cosponsor'

        bill['subjects'] = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            action_type = categorize_action(action)
            bill.add_action(actor, action, date, type=action_type)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if not rcmatch:
                continue

            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if not journal_link:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
                continue

            objectname = journal_link[0].rsplit('=', 1)[-1]
            chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
            vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                session, chamber_name, objectname)
            # Counts and result are placeholders until the roll call has
            # been parsed; they are filled in below.
            vote = Vote(actor, date, action, False, 0, 0, 0)
            self.parse_roll_call(vote, vote_url, rc_num)

            # check the expected counts vs actual
            for label, key in (('YEAS', 'yes_votes'), ('NAYS', 'no_votes')):
                count = re.search(r'%s (\d+)' % label, action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote[key]):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote[key])))

            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(vote_url)
            bill.add_vote(vote)

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            version = self.parse_doc_row(row)
            if version:
                bill.add_version(*version)

        # documents (same row format in both analysis tables)
        for table_id in ('frg_billstatus_HlaTable',
                         'frg_billstatus_SfaTable'):
            for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
                document = self.parse_doc_row(row)
                if document:
                    bill.add_document(*document)

        self.save_bill(bill)
        return True
Esempio n. 51
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape a single Michigan bill and save it.

        The bill page is requested for the session's first year and, if that
        page is missing or not yet available, for its last year.  Sponsors,
        subjects, actions (with roll-call votes where the action mentions
        one), versions and analysis documents are collected before the bill
        is saved.

        :param chamber: chamber passed through to ``Bill``.
        :param session: session string whose first and last four characters
            are the two years tried in the bill URL.
        :param bill_id: bill identifier such as ``"HB 4001"``.
        :returns: ``True`` once the bill has been saved, ``None`` if neither
            year has a usable page for the bill.
        """
        url_template = 'http://legislature.mi.gov/doc.aspx?%s-%s'
        bill_slug = bill_id.replace(' ', '-')
        missing_markers = (
            'Page Not Found',
            'The bill you are looking for is not available yet',
        )

        # try and get bill for current year
        url = url_template % (session[:4], bill_slug)
        html = self.get(url).text
        # if first page isn't found, try second year
        if any(marker in html for marker in missing_markers):
            # Rebind ``url`` so the source recorded below is the page that
            # was actually parsed, not the unavailable first-year URL.
            url = url_template % (session[-4:], bill_slug)
            html = self.get(url).text
            if any(marker in html for marker in missing_markers):
                return None

        doc = lxml.html.fromstring(html)

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(session=session, chamber=chamber, bill_id=bill_id,
                    title=title, type=bill_type)
        bill.add_source(url)

        # sponsors: first one listed is primary, the rest are cosponsors
        sp_type = 'primary'
        for sponsor in doc.xpath(
                '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ')
            bill.add_sponsor(sp_type, sponsor)
            sp_type = 'cosponsor'

        bill['subjects'] = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            action_type = categorize_action(action)
            bill.add_action(actor, action, date, type=action_type)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if not rcmatch:
                continue

            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if not journal_link:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
                continue

            objectname = journal_link[0].rsplit('=', 1)[-1]
            chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
            vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                session, chamber_name, objectname)
            # Counts and result are placeholders until the roll call has
            # been parsed; they are filled in below.
            vote = Vote(actor, date, action, False, 0, 0, 0)
            self.parse_roll_call(vote, vote_url, rc_num)

            # check the expected counts vs actual
            for label, key in (('YEAS', 'yes_votes'), ('NAYS', 'no_votes')):
                count = re.search(r'%s (\d+)' % label, action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote[key]):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count, len(vote[key])))

            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(vote_url)
            bill.add_vote(vote)

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            version = self.parse_doc_row(row)
            if not version:
                continue
            # version[1] is the value tested for a file extension.
            if version[1].endswith('.pdf'):
                bill.add_version(*version, mimetype='application/pdf')
            elif version[1].endswith('.htm'):
                bill.add_version(*version, mimetype='text/html')
            else:
                # Unknown extension: add the version without a mimetype
                # rather than reusing the previous row's value (or raising
                # NameError when this is the first row).
                bill.add_version(*version)

        # documents (same row format in both analysis tables)
        for table_id in ('frg_billstatus_HlaTable',
                         'frg_billstatus_SfaTable'):
            for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
                document = self.parse_doc_row(row)
                if document:
                    bill.add_document(*document)

        self.save_bill(bill)
        return True
Esempio n. 52
0
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` from a single bill record.

        Args:
            session: Session identifier handed to ``Bill`` and used when
                building the API source URL.
            bill_data: Mapping describing one bill. This method reads its
                ``summary``, ``title``, ``amendments``, ``sponsor``,
                ``actions`` and ``votes`` entries.

        Returns:
            The populated ``Bill``. Nothing is saved here; the caller
            receives the object.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (prefix, number, active_version)) = details

        bill = Bill(
            session,
            bill_chamber,
            bill_id,
            title,
            type=bill_type,
            summary=bill_data['summary'])

        # Fall back to the summary when the record carries no title.
        if bill_data['title'] is None:
            bill['title'] = bill_data['summary']

        bill_active_version = bill_data['amendments']['items'][active_version]

        # Parse sponsors.
        if bill_data['sponsor']['rules'] == True:
            bill.add_sponsor('primary', 'Rules Committee',
                chamber=bill_chamber)
        elif not bill_data['sponsor']['budget']:
            primary_sponsor = bill_data['sponsor']['member']
            bill.add_sponsor('primary', primary_sponsor['shortName'])

            # There *shouldn't* be cosponsors if there is no sponsor.
            for cosponsor in bill_active_version['coSponsors']['items']:
                bill.add_sponsor('cosponsor', cosponsor['shortName'])

        # List companion bill. "sameAs" may be missing or empty, so look the
        # items up defensively instead of indexing the default dict.
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            companion = same_as_items[0]

            # Get companion bill ID.
            companion_bill_id = companion['basePrintNo']

            # Build companion bill session, e.g. "<start>-<start + 1>".
            start_year = companion['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Determine companion bill chamber.
            companion_bill_prefix = self._parse_bill_number(
                companion_bill_id)[0]
            companion_bill_chamber = self._parse_bill_prefix(
                companion_bill_prefix)[0]

            # Attach companion bill data.
            bill.add_companion(
                companion_bill_id,
                companion_bill_session,
                companion_bill_chamber,
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_datetime = datetime.datetime.strptime(action['date'],
                '%Y-%m-%d')
            action_date = action_datetime.date()
            types, attrs = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                chamber,
                action['text'],
                action_date,
                type=types,
                **attrs)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                vote = self._parse_senate_votes(vote_data)
                bill.add_vote(vote)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()
            assembly_bill_data = assembly.bill

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        html_url = ('http://assembly.state.ny.us/leg/?sh=printbill&bn='
                    '{}&term={}').format(bill_id, self.term_start_year)
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'.format(
            self.term_start_year, bill_id)

        for amendment in bill_data['amendments']['items'].values():
            version = amendment['printNo']

            # The second argument must be the document URL, not the name.
            bill.add_version(version + ' HTML', html_url,
                mimetype='text/html')
            bill.add_version(version + ' PDF', pdf_url,
                mimetype='application/pdf')

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        bill.add_source(self.api_client.root + self.api_client.\
            resources['bill'].format(
                session_year=session,
                bill_id=bill_id,
                summary='',
                detail=''))
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        return bill
Esempio n. 53
0
    def scrape_bill(self, url, kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_get_actor=actions.get_actor):
        """Scrape one Delaware bill page and save the resulting ``Bill``.

        Args:
            url: Address of the bill page; also recorded as a bill source.
            kw: Keyword arguments forwarded unchanged to ``Bill``.
            re_amendment: Compiled pattern whose first group is the short
                amendment id taken from an amendment link's text.
            re_substitution: Compiled pattern whose first group is the name
                of the bill this one substitutes for.
            re_digits: Compiled pattern matched against the "General
                Assembly" heading to obtain the session number.
            actions_get_actor: Callable taking ``(action_text, chamber)``
                and returning the actor for that action.

        The bill is stored through ``self.save_bill``; nothing is returned.
        """
        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # Shortcut function partial to get text at a particular xpath:
        doc = self._url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        session_heading = doc.xpath(
            '//font[contains(., "General Assembly") and @face="Arial"]')[0]
        session_num = re_digits.match(session_heading.text_content()).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'}

        sponsor_xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(sponsor_xpath + '/text()')
        sponsors = doc.xpath(
            sponsor_xpath + '/../../following-sibling::td/font/text()')

        for heading, sponsor_text in zip(headings, sponsors):
            sponsor_names = self._cleanup_sponsors(sponsor_text, chamber)
            type_ = sponsor_types[heading.strip()]

            if sponsor_names:
                for sponsor_name, _chamber in sponsor_names:
                    bill.add_sponsor(type_, sponsor_name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions
        version_tmpl = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open'])

        documents = self.scrape_documents(source=url,
                                          docname="introduced",
                                          filename="Legis",
                                          tmp=version_tmpl,
                                          session_num=session_num)

        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        substitution_xpath = ('//*[contains(text(), "Substituted '
                              'Legislation for Bill:")]')
        original_names = doc.xpath(substitution_xpath + '/text()')
        original_urls = doc.xpath(
            substitution_xpath + '/following-sibling::a/@href')

        # Distinct loop names keep the ``url`` parameter intact.
        for original_name, original_url in zip(original_names, original_urls):
            original_name = re_substitution.match(original_name).group(1)
            bill.add_version(original_name, original_url,
                             description='original bill')

        #---------------------------------------------------------------------
        # Actions
        action_cells = doc.xpath(
            '//font[contains(., "Actions History")]'
            '/../following-sibling::table/descendant::td[2]')
        # A real list (not a lazy ``filter`` object) so ``reversed`` works on
        # Python 3 as well as Python 2.
        action_lines = [line
                        for line in action_cells[0].text_content().splitlines()
                        if line]

        for line in reversed(action_lines):
            date, action = line.split(' - ', 1)
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

            actor = actions_get_actor(action, bill['chamber'])
            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        #---------------------------------------------------------------------
        # Votes
        vote_xpath = '//*[contains(text(), "vote:")]'
        vote_strings = doc.xpath(vote_xpath + '/text()')

        # Sometimes vote strings are contained in weird, separate elements.
        # Probably hand edited.
        if not all(re.search(r'\d', string) for string in vote_strings):
            # Use the parent's text_content instead.
            vote_strings = [el.text_content()
                            for el in doc.xpath(vote_xpath + '/..')]

        vote_urls = doc.xpath(vote_xpath + '/following-sibling::a/@href')
        for vote_string, vote_url in zip(vote_strings, vote_urls):
            vote_data = parse_votestring(vote_string)
            vote = self.scrape_vote(vote_url, **vote_data)
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        amendment_xpath = ("//font[contains(., 'Amendments')]/"
                           "../../../td[2]/font/a")

        amendment_tmpl = (
            'http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
            'vwLegislation/{id_}/$file/{filename}{format_}?open')

        for source, id_ in zip(doc.xpath(amendment_xpath + '/@href'),
                               doc.xpath(amendment_xpath + '/text()')):

            short_id = re_amendment.match(id_).group(1)

            documents = self.scrape_documents(
                source=source,
                docname='amendment (%s)' % short_id,
                filename='Legis',
                tmp=amendment_tmpl, session_num=session_num,
                id_=id_)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:
            engrossment_tmpl = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open'])

            documents = self.scrape_documents(
                source=source[0],
                docname="Engrossment",
                filename="Engross",
                tmp=engrossment_tmpl,
                session_num=session_num,
                id_=bill['bill_id'])

            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:
            fiscal_tmpl = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open'])

            documents = self.scrape_documents(
                source=source[0],
                docname="Fiscal Note",
                filename="Fiscal",
                tmp=fiscal_tmpl,
                session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # XPath for the first td sibling of the row labelled by a heading.
        sibling_xpath = '//font[contains(., "%s")]/../../../td[2]'

        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, heading in extra_fields.items():
            try:
                bill[key] = _get_text(sibling_xpath % heading)
            except IndexError:
                # xpath lookup failed.
                pass

        self.save_bill(bill)
Esempio n. 54
0
    def scrape_bill(self, chamber, bill):
        """Scrape a single bill page, including its votes, and save it.

        Args:
            chamber: Chamber value forwarded to ``self.scrape_votes``.
            bill: Mapping with at least ``id``, ``url``, ``session`` and
                ``chamber`` entries describing the bill to fetch.

        The resulting ``Bill`` is stored through ``self.save_bill``; nothing
        is returned.
        """
        bill_id = bill['id'].replace('w/', 'with ')

        page = lxml.html.fromstring(self.urlopen(bill['url']))
        page.make_links_absolute(bill['url'])

        title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
        # text_content() == make sure any tags in the title don't cause issues
        title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

        # now we can create a bill object
        b = Bill(bill['session'], bill['chamber'], bill_id, title)
        b.add_source(bill['url'])

        sponsors_row = page.xpath(
            '//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
        sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text
        # The cell's text is None when it is empty; don't record that.
        if sponsor is not None:
            b.add_sponsor('primary', sponsor)

        # scraping these and co-sponsors, but not doing anything with them until
        # it's decided whether or not to attempt to split 'em up
        additional = sponsors_row.xpath('td[@width="48%"]/font')
        # ``.text`` can be None for an empty element, so default to "".
        additional_sponsors = (additional[0].text or "") if additional else ""
        additional_sponsors = additional_sponsors.replace(
            '&nbsp&nbsp&nbsp', '')

        cosponsors_row = page.xpath(
            '//tr[td/b[contains(font,"CoSponsors")]]')[0]
        cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
        cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

        introduced_row = page.xpath(
            '//tr[td/b[contains(font,"Introduced On")]]')
        if introduced_row:
            # Relative path: the cell is looked up inside the matched row,
            # like the other row lookups in this method.
            introduced = introduced_row[0].xpath(
                'td[@width="31%"]/font')[0].text
            introduced = datetime.strptime(introduced, '%b %d, %Y')
            b.add_action(bill['chamber'], 'introduced', introduced,
                         'bill:introduced')

        actions = page.xpath(
            '//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font'
        )
        if actions:
            for line in actions[0].text_content().split('\n'):
                # Blank lines carry no "<date> - <text>" pair to parse.
                if not line.strip():
                    continue
                date_text, _, action_text = line.partition(' - ')
                date = datetime.strptime(date_text.strip(), '%b %d, %Y')
                b.add_action(bill['chamber'], action_text, date)

        # save vote urls for scraping later
        vote_urls = []
        voting_reports = page.xpath(
            '//tr[td/b[contains(font, "Voting Reports")]]')
        if voting_reports:
            vote_urls = [report.attrib['href']
                         for report in voting_reports[0].xpath('td/font/a')]

        # Scrape votes
        for url in vote_urls:
            vote = self.scrape_votes(chamber, title, bill_id, url)
            b.add_vote(vote)

        # Save bill
        self.save_bill(b)
Esempio n. 55
0
    def scrape_bill(self,
                    url,
                    kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_categorize=actions.categorize,
                    actions_get_actor=actions.get_actor):
        """Scrape one Delaware bill page and save the resulting ``Bill``.

        Args:
            url: Address of the bill page; also recorded as a bill source.
            kw: Keyword arguments forwarded unchanged to ``Bill``.
            re_amendment: Compiled pattern whose first group is the short
                amendment id taken from an amendment link's text.
            re_substitution: Compiled pattern whose first group is the name
                of the bill this one substitutes for.
            re_digits: Compiled pattern matched against the "General
                Assembly" heading to obtain the session number.
            actions_categorize: Callable taking the action text and
                returning the type passed to ``Bill.add_action``.
            actions_get_actor: Callable taking ``(action_text, chamber)``
                and returning the actor for that action.

        The bill is stored through ``self.save_bill``; nothing is returned.
        """
        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # Shortcut function partial to get text at a particular xpath:
        doc = self._url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        session_heading = doc.xpath(
            '//font[contains(., "General Assembly") and @face="Arial"]')[0]
        session_num = re_digits.match(session_heading.text_content()).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'
        }

        sponsor_xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(sponsor_xpath + '/text()')
        sponsors = doc.xpath(
            sponsor_xpath + '/../../following-sibling::td/font/text()')

        for heading, sponsor_text in zip(headings, sponsors):
            sponsor_names = self._cleanup_sponsors(sponsor_text, chamber)
            type_ = sponsor_types[heading.strip()]

            if sponsor_names:
                for sponsor_name, _chamber in sponsor_names:
                    bill.add_sponsor(type_, sponsor_name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions
        version_tmpl = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open'
        ])

        documents = self.scrape_documents(source=url,
                                          docname="introduced",
                                          filename="Legis",
                                          tmp=version_tmpl,
                                          session_num=session_num)

        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        substitution_xpath = ('//*[contains(text(), "Substituted '
                              'Legislation for Bill:")]')
        original_names = doc.xpath(substitution_xpath + '/text()')
        original_urls = doc.xpath(
            substitution_xpath + '/following-sibling::a/@href')

        # Distinct loop names keep the ``url`` parameter intact.
        for original_name, original_url in zip(original_names, original_urls):
            original_name = re_substitution.match(original_name).group(1)
            bill.add_version(original_name, original_url,
                             description='original bill')

        #---------------------------------------------------------------------
        # Actions
        action_cells = doc.xpath(
            '//font[contains(., "Actions History")]'
            '/../following-sibling::table/descendant::td[2]')
        # A real list (not a lazy ``filter`` object) so ``reversed`` works on
        # Python 3 as well as Python 2.
        action_lines = [line
                        for line in action_cells[0].text_content().splitlines()
                        if line]

        for line in reversed(action_lines):
            date, action = line.split(' - ', 1)
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

            actor = actions_get_actor(action, bill['chamber'])
            type_ = actions_categorize(action)
            bill.add_action(actor, action, date, type_)

        #---------------------------------------------------------------------
        # Votes
        vote_xpath = '//*[contains(text(), "vote:")]'
        vote_strings = doc.xpath(vote_xpath + '/text()')

        # Sometimes vote strings are contained in weird, separate elements.
        # Probably hand edited.
        if not all(re.search(r'\d', string) for string in vote_strings):
            # Use the parent's text_content instead.
            vote_strings = [el.text_content()
                            for el in doc.xpath(vote_xpath + '/..')]

        vote_urls = doc.xpath(vote_xpath + '/following-sibling::a/@href')
        for vote_string, vote_url in zip(vote_strings, vote_urls):
            vote_data = parse_votestring(vote_string)
            vote = self.scrape_vote(vote_url, **vote_data)
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        amendment_xpath = ("//font[contains(., 'Amendments')]/"
                           "../../../td[2]/font/a")

        amendment_tmpl = (
            'http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
            'vwLegislation/{id_}/$file/{filename}{format_}?open')

        for source, id_ in zip(doc.xpath(amendment_xpath + '/@href'),
                               doc.xpath(amendment_xpath + '/text()')):

            short_id = re_amendment.match(id_).group(1)

            documents = self.scrape_documents(source=source,
                                              docname='amendment (%s)' %
                                              short_id,
                                              filename='Legis',
                                              tmp=amendment_tmpl,
                                              session_num=session_num,
                                              id_=id_)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:
            engrossment_tmpl = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Engrossment",
                                              filename="Engross",
                                              tmp=engrossment_tmpl,
                                              session_num=session_num,
                                              id_=bill['bill_id'])

            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:
            fiscal_tmpl = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open'
            ])

            documents = self.scrape_documents(source=source[0],
                                              docname="Fiscal Note",
                                              filename="Fiscal",
                                              tmp=fiscal_tmpl,
                                              session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # XPath for the first td sibling of the row labelled by a heading.
        sibling_xpath = '//font[contains(., "%s")]/../../../td[2]'

        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, heading in extra_fields.items():
            try:
                bill[key] = _get_text(sibling_xpath % heading)
            except IndexError:
                # xpath lookup failed.
                pass

        self.save_bill(bill)
Esempio n. 56
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Convert every stored bill of one measure type into a saved ``Bill``.

        Args:
            chamber: ``'upper'`` selects ``SENATE`` authors; any other value
                selects ``ASSEMBLY`` authors. Also passed to ``Bill`` and
                used as the actor when an action has none.
            session: Value matched against ``CABill.session_year`` and used
                as the base of the bill's session name.
            bill_type: First entry of the bill's ``type`` list for every
                version that has ``bill_xml``.
            type_abbr: Value matched against ``CABill.measure_type``.

        Each bill with a usable title is stored through ``self.save_bill``;
        bills without one are skipped with a warning. Nothing is returned.

        Raises:
            ScrapeError: If a vote's location description starts with
                neither an Assembly nor a Senate marker.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version(bill_id, source_url, 'text/html')

            title = ''
            short_title = ''
            bill_types = ['bill']
            subject = ''
            all_titles = set()
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                title = clean_title(version.title)
                if title:
                    all_titles.add(title)
                short_title = clean_title(version.short_title)
                # Rebuilt per version, so the last version with XML wins.
                bill_types = [bill_type]

                if version.appropriation == 'Yes':
                    bill_types.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    bill_types.append('fiscal committee')
                if version.local_program == 'Yes':
                    bill_types.append('local program')
                if version.urgency == 'Yes':
                    bill_types.append('urgency')
                if version.taxlevy == 'Yes':
                    bill_types.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = bill_types
            # A concrete list rather than a lazy ``filter`` object.
            fsbill['subjects'] = [subject] if subject else []

            # We don't want the current title in alternate_titles
            all_titles.discard(title)

            fsbill['alternate_titles'] = list(all_titles)

            # ``version`` is whatever the loop above visited last, which is
            # not necessarily a version that has bill_xml.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            introduced = False
            committee_code_regex = self.committee_code_regex()
            committee_slug_regex = self.committee_slug_regex()

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                action_types = []

                # Collapse every whitespace run to a single space.
                act_str = re.sub(r'\s+', ' ', action.action)

                if act_str.startswith('Introduced'):
                    introduced = True
                    action_types.append('bill:introduced')

                if 'Read first time.' in act_str:
                    if not introduced:
                        action_types.append('bill:introduced')
                        introduced = True
                    action_types.append('bill:reading:1')

                if 'To Com' in act_str or 'referred to' in act_str.lower():
                    action_types.append('committee:referred')

                # One combined test so 'bill:passed' is recorded at most
                # once. The phrasings overlap, and a double-space variant
                # can never occur after the whitespace collapse above.
                if ('Read third time. Passed' in act_str
                        or 'Read third time, passed' in act_str
                        or re.search(r'Read third time.+?Passed and',
                                     act_str)):
                    action_types.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    action_types.append('governor:signed')

                if 'Item veto' in act_str:
                    action_types.append('governor:vetoed:line-item')

                if 'Vetoed by Governor' in act_str:
                    action_types.append('governor:vetoed')

                if 'To Governor' in act_str:
                    action_types.append('governor:received')

                if 'Read second time' in act_str:
                    action_types.append('bill:reading:2')

                if not action_types:
                    action_types = ['other']

                # Add in the committee ID of the related committee, if any.
                kwargs = {}
                code = re.search(committee_code_regex, actor, re.I)
                if code:
                    code = code.group()
                    committee_id = self.committee_code_to_id(code)
                    if committee_id:
                        kwargs['actor_id'] = committee_id
                        kwargs['actor_collection'] = 'committees'
                        actor_text = re.search(committee_slug_regex,
                                               action.action)
                        if actor_text:
                            kwargs['actor_text'] = actor_text.group()
                        else:
                            kwargs['actor_text'] = 'committee'

                fsbill.add_action(actor,
                                  act_str,
                                  action.action_date.date(),
                                  type=action_types,
                                  **kwargs)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                # The first word names the chamber; the rest is the location.
                full_loc = vote.location.description
                loc_words = full_loc.split(' ')
                first_part = loc_words[0].lower()
                if first_part in ('asm', 'assembly'):
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)
                vote_location = ' '.join(loc_words[1:])

                motion = vote.motion.motion_text or ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # NOTE(review): unlike its neighbours this pattern omits
                # SJR -- confirm whether that is intentional.
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                for s in ('yes', 'no', 'other'):
                    # Kill dupe votes.
                    key = s + '_votes'
                    fsvote[key] = list(set(fsvote[key]))

                # In a small percentage of bills, the integer vote counts
                # are inaccurate, so let's ignore them.
                for k in ('yes', 'no', 'other'):
                    fsvote[k + '_count'] = len(fsvote[k + '_votes'])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)