Ejemplo n.º 1
0
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath(
                '//h2[@style="font-size:1.3rem;"]/a[1]/text()')[0].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath(
                '//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
        ):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title,
                fiscal_url,
                media_type="application/pdf",
            )

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
Ejemplo n.º 2
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        xml_chamber = xpath(page, 'string(wa:OriginalAgency)')
        chamber = self._chamber_map[xml_chamber]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=[bill_type])
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" %
                       (bill_num, session[0:4]))

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(note=version['note'],
                                      url=version['url'],
                                      media_type=version['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(note=document['note'],
                                       url=document['url'],
                                       media_type=document['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Ejemplo n.º 3
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=[bill_type])
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(note=version['note'],
                                      url=version['url'],
                                      media_type=version['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(note=document['note'],
                                       url=document['url'],
                                       media_type=document['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Ejemplo n.º 4
0
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath(
                "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id,
                        legislative_session=session,
                        title=title,
                        chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text,
                                      a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Ejemplo n.º 5
0
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text, a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Ejemplo n.º 6
0
    def scrape(self, window=28) :
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in self.matters(n_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Board of Directors"})
            
            legistar_web = matter['legistar_url']
            
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill. 
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Ejemplo n.º 7
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec['IdenticalBillNumber'].strip():
                bill.add_related_bill(
                    rec['IdenticalBillNumber'].split()[0],
                    legislative_session=session,
                    relation_type='companion',
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(name, classification=sponsor_type, entity_type='person',
                                 primary=sponsor_type == 'primary')

        # Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
                year_abr,
                document.replace('.DOC', '.HTM'),
            )

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if rec['DocType'] in self._version_types:
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                try:
                    bill.add_version_link(doc_name, htm_url, media_type=mimetype)
                except ValueError:
                    self.warning("Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'rU'))
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                votes = {}
                if filename.startswith('A') or filename.startswith('CA'):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith('C'):
                    vote_file_type = 'committee'
                else:
                    vote_file_type = 'chamber'

                for rec in vdict_file:
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]
                        committee = rec['Committee_House']
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join(vote_parts).replace(' ', '_')

                    if bill_id[0] == 'A':
                        b_chamber = "lower"
                    else:
                        b_chamber = "upper"

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification='passage',
                            result=None,
                            bill=bill_id,
                            bill_chamber=b_chamber,
                            legislative_session=session,
                        )
                    if leg_vote == "Y":
                        votes[vote_id].vote('yes', leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote('no', leg)
                    else:
                        votes[vote_id].vote('other', leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count['option']] += 1
                vote.set_count('yes', counts['yes'])
                vote.set_count('no', counts['no'])
                vote.set_count('other', counts['other'])

                # Veto override.
                if vote.motion_text == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if vote.chamber == 'lower':
                        vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                    elif vote['chamber'] == 'upper':
                        vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
                else:
                    # Regular vote.
                    vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

                vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield vote

        # Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning('probable phony bill detected %s',
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield bill

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Ejemplo n.º 8
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec['IdenticalBillNumber'].strip():
                bill.add_related_bill(
                    rec['IdenticalBillNumber'].split()[0],
                    legislative_session=session,
                    relation_type='companion',
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(name,
                                 classification=sponsor_type,
                                 entity_type='person',
                                 primary=sponsor_type == 'primary')

        # Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
                year_abr,
                document.replace('.DOC', '.HTM'),
            )

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if rec['DocType'] in self._version_types:
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                try:
                    bill.add_version_link(doc_name,
                                          htm_url,
                                          media_type=mimetype)
                except ValueError:
                    self.warning(
                        "Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'r'))
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                votes = {}
                if filename.startswith('A') or filename.startswith('CA'):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith('C'):
                    vote_file_type = 'committee'
                else:
                    vote_file_type = 'chamber'

                for rec in vdict_file:
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'],
                                            rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]
                        committee = rec['Committee_House']
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join(vote_parts).replace(' ', '_')

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification='passage',
                            result=None,
                            bill=bill_dict[bill_id])
                    if leg_vote == "Y":
                        votes[vote_id].vote('yes', leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote('no', leg)
                    else:
                        votes[vote_id].vote('other', leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count['option']] += 1
                vote.set_count('yes', counts['yes'])
                vote.set_count('no', counts['no'])
                vote.set_count('other', counts['other'])

                # Veto override.
                if vote.motion_text == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if 'lower' in vote.bill:
                        vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                    elif 'upper' in vote.bill:
                        vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
                else:
                    # Regular vote.
                    vote.result = 'pass' if counts['yes'] > counts[
                        'no'] else 'fail'

                vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield vote

        # Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning('probable phony bill detected %s',
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield bill

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Ejemplo n.º 9
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace('Bill.aspx', 'BillContent.aspx')
        url = url.replace('&code=R', '&code=R&style=new')

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        # bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip(
        ) == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = 'Appropriations Bill'
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title))
                return

        bill = Bill(
            bill_id,
            chamber='lower',
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note='official')

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(
            bill_sponsor,
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # check for cosponsors
        sponsors_url, = bill_page.xpath(
            "//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (self._house_base_url,
                                 doc_tag[0].attrib['href'])
            bill.add_document_link(doc, text_url, media_type='text/html')

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version,
                                      vurl.attrib['href'],
                                      media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill versions
        # everything between the row containing "Bill Text"" and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Text")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath(
                    './/div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version,
                                      path,
                                      media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill summaries
        # everything between the row containing "Bill Summary"" and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )

        # if there are no amedments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[contains(text(),"Bill Summary")]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = 'Bill Summary ({})'.format(version)
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_document_link(summary_name,
                                       path,
                                       media_type=mimetype,
                                       on_duplicate='ignore')

        # house bill amendments
        amendment_rows = bill_page.xpath(
            '//div[contains(text(),"Amendment")]/'
            'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip(
                )
            path = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip(
                )
            summary_name = 'Amendment {}'.format(version)

            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = '{} (Defeated)'.format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = '{} (Adopted)'.format(summary_name)

            distributed_icon = row.xpath(
                './/img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = '{} (Distributed)'.format(summary_name)

            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(summary_name,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate='ignore')

        yield bill
Ejemplo n.º 10
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 11
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                    u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        return bill
Ejemplo n.º 12
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if '*' in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace('  ', ' ')

        bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'

        primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = '%s detail page was missing title info.'
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find('-')
        subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link('Current Version', btext.get('href'),
                              media_type='application/pdf')

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link('Amendment ' + amendment.text, amendment.get('href'))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get('alt'),
                afn.getparent().get('href'),
                on_duplicate='ignore'
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = page.xpath(
                "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
            secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification='primary',
                    entity_type='person',
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a['date'])
        yield bill
Ejemplo n.º 13
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
                session[-4:], bill_id.replace(' ', '-'))
            html = self.get(url).text
            if ('Page Not Found' in html or
                    'The bill you are looking for is not available yet' in html):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute('http://legislature.mi.gov')

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u'\xa0', ' ')
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    'primary'
                    if sponsor.tail and 'primary' in sponsor.tail
                    else 'cosponsor'
                )
            else:
                classification = 'primary'
            bill.add_sponsorship(
                name=name,
                chamber=chamber,
                entity_type='person',
                primary=classification == 'primary',
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
            if submatch and tds[2].xpath('a'):
                version_url = tds[2].xpath('a/@href')[0]
                version_name = tds[2].xpath('a/text()')[0].strip()
                version_name = 'Substitute {}'.format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif version_url.lower().endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    results = self.parse_roll_call(vote_url, rc_num)
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result='pass' if len(results['yes']) > len(results['no']) else 'fail',
                        classification='passage',
                    )

                    # check the expected counts vs actual
                    count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['yes']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['yes'])))
                    count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['no']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['no'])))

                    vote.set_count('yes', len(results['yes']))
                    vote.set_count('no', len(results['no']))
                    vote.set_count('other', len(results['other']))

                    for name in results['yes']:
                        vote.yes(name)
                    for name in results['no']:
                        vote.no(name)
                    for name in results['other']:
                        vote.vote('other', name)

                    vote.add_source(vote_url)
                    yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif url.endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
Ejemplo n.º 14
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        last_action = self.parse_bill_field(
            page, 'Last Action').xpath('text()')[0]
        if 'WITHDRAWN' in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

        version = self.parse_bill_field(page, 'Bill Documents')
        source_url = version.xpath('a[1]/@href')[0]
        version_title = version.xpath('a[1]/text()')[0].strip()

        if version is None:
            # Bill withdrawn
            self.logger.warning('Bill withdrawn.')
            return
        else:
            if source_url.endswith('.doc'):
                mimetype = 'application/msword'
            elif source_url.endswith('.pdf'):
                mimetype = 'application/pdf'

        title = self.parse_bill_field(page, 'Title').text_content()

        # actions = self.get_nodes(
        #     page,
        #     '//div[@class="StandardText leftDivMargin"]/'
        #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

        if 'CR' in bill_id:
            bill_type = 'concurrent resolution'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=bill_type)
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link(version_title, source_url, media_type=mimetype)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib['href']
            if source_url.endswith('.doc'):
                mimetype = 'application/msword'
            elif source_url.endswith('.pdf'):
                mimetype = 'application/pdf'

            bill.add_document_link(
                "Fiscal Note", source_url, media_type=mimetype)

        for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(link.text.strip(), classification='primary',
                                 entity_type='person', primary=True)

        bdr_no = self.parse_bill_field(page, 'Bill Request Number')
        if bdr_no.xpath('text()'):
            bdr = bdr_no.xpath('text()')[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Ejemplo n.º 15
0
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circut the json id lookup.

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Ejemplo n.º 16
0
    def scrape_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if '.B. ' in bill_id:
            bill_type = 'bill'
        elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
            bill_type = 'resolution'
        elif '.C.R. ' in bill_id:
            bill_type = 'concurrent resolution'
        elif '.J.R. ' in bill_id:
            bill_type = 'joint resolution'

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub("\s+", " ", bill_id).strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            (title, name) = [x.strip() for x
                             in info.xpath('.//text()')
                             if x.strip()]
            assert title == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)
        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(floor_sponsor,
                                 classification='cosponsor',
                                 entity_type='person',
                                 primary=False)
        else:
            raise AssertionError("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]'
        )

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            url = version.get('href')
            if not url:
                url = version.xpath('following-sibling::a[1]/@href')[0]

            bill.add_version_link(
                version.xpath('text()')[0].strip(),
                url,
                media_type='application/pdf'
            )

        for related in page.xpath('//b[text()="Related Documents "]/following-sibling::ul/li/'
                                  'a[contains(@class,"nlink")]'):
            href = related.xpath('@href')[0]
            if '.fn.pdf' in href:
                bill.add_document_link("Fiscal Note", href, media_type='application/pdf')
            else:
                text = related.xpath('text()')[0]
                bill.add_document_link(text, href, media_type='application/pdf')

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill.subject = subjects

        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

        yield bill
Ejemplo n.º 17
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(
            page,
            "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def sbp(x): return self.scrape_bare_page(page.xpath(
            "//a[contains(text(), '%s')]" % (x))[0].attrib['href'])

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        try:
            amendments = sbp("Amendments")
        except IndexError:
            amendments = []

        title = page.xpath(
            "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']/"
            "/table[@style='font-size:small']/tr")

        bill_id = page.xpath(
            "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = self._bill_types[bill_abbreviation[1:]]
        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(bill_url)

        authors.remove(author)
        bill.add_sponsorship(author,
                             classification='primary',
                             entity_type='person',
                             primary=True)
        for author in authors:
            bill.add_sponsorship(author,
                                 classification='cosponsor',
                                 entity_type='person',
                                 primary=False)

        for digest in digests:
            bill.add_document_link(note=digest.text,
                                   url=digest.attrib['href'],
                                   media_type="application/pdf")

        for version in versions:
            bill.add_version_link(note=version.text,
                                  url=version.attrib['href'],
                                  media_type="application/pdf")

        for amendment in amendments:
            bill.add_version_link(note=amendment.text,
                                  url=amendment.attrib['href'],
                                  media_type="application/pdf")

        flags = {
            "prefiled": ["filing"],
            "referred to the committee": ["referral-committee"],
            "sent to the house": ['passage'],
            "ordered returned to the house": ['passage'],
            "ordered to the senate": ['passage'],
            "signed by the governor": ['executive-signature'],
            "sent to the governor": ['executive-receipt'],
        }

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            yield from self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            session_year = self.jurisdiction.legislative_sessions[-1]['start_date'][0:4]
            # Session is April -> June. Prefiles look like they're in
            # January at earliest.
            date += '/{}'.format(session_year)
            date = dt.datetime.strptime(date, '%m/%d/%Y')
            chamber = self._chambers[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            bill.add_action(description=text,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=chamber,
                            classification=cat)

        yield bill
Ejemplo n.º 18
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if (bill_title is None or
                "Bill does not exist" in history_xml):
            self.warning("Bill does not appear to exist")
            return
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type
        )

        bill.add_source(history_url)

        for subject in root.iterfind('subjects/subject'):
            bill.add_subject(subject.text.strip())

        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version_link(
                note=self.NAME_SLUGS[version[1][-5]],
                url=version[1],
                media_type='text/html'
            )

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document_link(
                note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
                url=analysis[1],
                media_type='text/html'
            )

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document_link(
                note="Fiscal Note ({})".format(self.NAME_SLUGS
                                               [fiscal_note[1][-5]]),
                url=fiscal_note[1],
                media_type='text/html'
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(self.NAME_SLUGS
                                                [witness[1][-5]]),
                url=witness[1],
                media_type='text/html'
            )

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            action_number = action.find('actionNumber').text
            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[action_number[0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            introduced = False

            if desc == 'Amended':
                atype = 'amendment-passage'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment-introduction'
            elif desc == 'Amendment amended':
                atype = 'amendment-amendment'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment-withdrawal'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'passage'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'introduction'
                else:
                    atype = 'filing'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'executive-receipt'
            elif desc.startswith('Signed by the Governor'):
                atype = 'executive-signature'
            elif desc == 'Vetoed by the Governor':
                atype = 'executive-veto'
            elif desc == 'Read first time':
                atype = ['introduction', 'reading-1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['passage']
                if not introduced:
                    introduced = True
                    atype.append('introduction')
            elif desc == "Passed as amended":
                atype = 'passage'
            elif (desc.startswith('Referred to') or
                    desc.startswith("Recommended to be sent to ")):
                atype = 'referral-committee'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee-passage'
            elif desc == "Filed":
                atype = 'filing'
            elif desc == 'Read 3rd time':
                atype = 'reading-3'
            elif desc == 'Read 2nd time':
                atype = 'reading-2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee-passage-favorable'
            else:
                atype = None

            act = bill.add_action(
                action.findtext('description'),
                act_date,
                chamber=actor,
                classification=atype
            )

            if atype and 'referral-committee' in atype:
                repls = [
                    'Referred to',
                    "Recommended to be sent to "
                ]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type='organization')

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsorship(author, classification='primary',
                                     entity_type='person', primary=True)
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsorship(coauthor, classification='cosponsor',
                                     entity_type='person', primary=False)
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsorship(sponsor, classification='primary',
                                     entity_type='person', primary=True)
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsorship(cosponsor, classification='cosponsor',
                                     entity_type='person', primary=False)

        if root.findtext('companions'):
            self._get_companion(bill)

        yield bill
Ejemplo n.º 19
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = ("http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                         "{}?calendarDate=".format(session, bill_num))
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode("utf-8"))

        chamber = "lower" if bill_json["bill"][0] else "upper"

        bill = Bill(
            identifier=bill_json["bill"],
            legislative_session=session,
            title=bill_json["catchTitle"],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json["billTitle"])

        source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
            session, bill_json["bill"])
        bill.add_source(source_url)

        for action_json in bill_json["billActions"]:
            utc_action_date = self.parse_local_date(action_json["statusDate"])

            actor = None
            if action_json["location"] and action_json[
                    "location"] in chamber_map:
                actor = chamber_map[action_json["location"]]

            action = bill.add_action(
                chamber=actor,
                description=action_json["statusMessage"],
                date=utc_action_date,
                classification=categorize_action(action_json["statusMessage"]),
            )

            action.extras = {
                "billInformationID": action_json["billInformationID"]
            }

        if bill_json["introduced"]:
            url = "http://wyoleg.gov/{}".format(bill_json["introduced"])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["enrolledAct"]:
            url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["fiscalNote"]:
            url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["digest"]:
            url = "http://wyoleg.gov/{}".format(bill_json["digest"])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["vetoes"]:
            for veto in bill_json["vetoes"]:
                url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
                bill.add_version_link(
                    note=veto["vetoLinkText"],
                    url=url,
                    media_type="application/pdf",  # optional but useful!
                )

        for amendment in bill_json["amendments"]:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
                session, amendment["amendmentNumber"])

            if amendment["sponsor"] and amendment["status"]:
                title = "Amendment {} ({}) - {} ({})".format(
                    amendment["amendmentNumber"],
                    amendment["order"],
                    amendment["sponsor"],
                    amendment["status"],
                )
            else:
                title = "Amendment {} ({})".format(
                    amendment["amendmentNumber"], amendment["order"])
            # add versions of the bill text
            version = bill.add_version_link(note=title,
                                            url=url,
                                            media_type="application/pdf")
            version["extras"] = {
                "amendmentNumber": amendment["amendmentNumber"],
                "sponsor": amendment["sponsor"],
            }

        for sponsor in bill_json["sponsors"]:
            status = "primary" if sponsor["primarySponsor"] else "cosponsor"
            sponsor_type = "person" if sponsor[
                "sponsorTitle"] else "organization"
            bill.add_sponsorship(
                name=sponsor["name"],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor["primarySponsor"],
            )

        if bill_json["summary"]:
            bill.add_abstract(note="summary", abstract=bill_json["summary"])

        if bill_json["enrolledNumber"]:
            bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

        if bill_json["chapter"]:
            bill.extras["chapter"] = bill_json["chapter"]

        if bill_json["effectiveDate"]:
            eff = datetime.datetime.strptime(bill_json["effectiveDate"],
                                             "%m/%d/%Y")
            bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

        bill.extras["wy_bill_id"] = bill_json["id"]

        for vote_json in bill_json["rollCalls"]:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Ejemplo n.º 20
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if '*' in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace('  ', ' ')

        bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'

        primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = '%s detail page was missing title info.'
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find('-')
        subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link('Current Version', btext.get('href'),
                              media_type='application/pdf')

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link('Amendment ' + amendment.text, amendment.get('href'))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get('alt'),
                afn.getparent().get('href'),
                on_duplicate='ignore'
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = page.xpath(
                "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
            secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification='primary',
                    entity_type='person',
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a['date'])
        yield bill
Ejemplo n.º 21
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] else 'upper'

        bill = Bill(identifier=bill_json['bill'],
                    legislative_session=session,
                    title=bill_json['catchTitle'],
                    chamber=chamber,
                    classification="bill",
                    )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(session,
                                                                      bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if action_json['location'] and action_json['location'] in chamber_map:
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']}

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(note="Introduced",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(note="Enrolled",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(note="Fiscal Note",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(note="Bill Digest",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(note=veto['vetoLinkText'],
                                      url=url,
                                      media_type="application/pdf"  # optional but useful!
                                      )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
            bill.add_sponsorship(
                name=sponsor['name'],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor['primarySponsor']
            )

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(
                bill_json['effectiveDate'], '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Ejemplo n.º 22
0
    def scrape_events_range(self, start_date, end_date):

        def daterange(start_date, end_date):
            number_of_days = int((end_date - start_date).days)
            for n in range(number_of_days):
                yield start_date + dt.timedelta(n)

        for date in daterange(start_date, end_date):
            events = self.extract_events_by_day(date)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                time = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
                org_name = event['meeting']
                e = Event(
                    name = org_name,
                    start_time = start,
                    timezone = tz.zone,
                    location_name = event['location'],
                    status=STATUS_DICT.get(event['meeting_status'])
                    )
                e.add_source(source_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                    }
                e.add_participant(
                    name = org_name,
                    type = 'organization',
                    )

                def is_agenda_available(event):
                    return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

                def is_council(event):
                    return True if event['meeting'] == self.jurisdiction.name else False

                if is_agenda_available(event):
                    template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = template.format(event['meeting_id'])
                    full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event)))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        def normalize_wards(raw):
                            if not raw: raw = 'All'
                            if raw == 'All':
                                return raw.lower()
                            else:
                                return raw.split(', ')

                        wards = normalize_wards(item['wards'])
                        identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                        [full_identifier] = [id for id in full_identifiers if identifier_regex.match(id).group(1) == item['identifier']]
                        a.add_bill(full_identifier)
                        if full_identifier not in self.seen_agenda_items:
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session = '2014-2018',
                                identifier = full_identifier,
                                title = item['title'],
                                from_organization = {'name': self.jurisdiction.name},
                                )
                            b.add_source(agenda_url)
                            b.add_document_link(note='canonical', media_type='text/html', url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                            b.extras = {
                                'wards': wards,
                                }

                            self.seen_agenda_items.append(full_identifier)

                            yield b

                yield e
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                                   spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber,
                                               act_date, vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(action,
                            chamber=act_chamber,
                            date=act_date.strftime('%Y-%m-%d'),
                            classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = ("http://www.legis.state.ak.us/"
                         "basis/get_fulltext.asp?session=%s&bill=%s") % (
                             session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = ("http://www.legis.state.ak.us/"
                        "basis/get_documents.asp?session=%s&bill=%s") % (
                            session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Ejemplo n.º 24
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace('Bill.aspx', 'BillContent.aspx')
        url = url.replace('&code=R', '&code=R&style=new')

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        # bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip() == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = 'Appropriations Bill'
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title
                ))
                return

        bill = Bill(
            bill_id,
            chamber='lower',
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note='official')

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(
            bill_sponsor,
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # check for cosponsors
        sponsors_url, = bill_page.xpath(
            "//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (
                self._house_base_url,
                doc_tag[0].attrib['href']
            )
            bill.add_document_link(doc, text_url, media_type='text/html')

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version, vurl.attrib['href'], media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill versions
        # everything between the row containing "Bill Text"" and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Text")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version, path, media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill summaries
        # everything between the row containing "Bill Summary"" and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')

        # if there are no amedments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[contains(text(),"Bill Summary")]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = 'Bill Summary ({})'.format(version)
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_document_link(summary_name, path, media_type=mimetype,
                                       on_duplicate='ignore')

        # house bill amendments
        amendment_rows = bill_page.xpath('//div[contains(text(),"Amendment")]/'
                                         'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
            path = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
            summary_name = 'Amendment {}'.format(version)

            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = '{} (Defeated)'.format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = '{} (Adopted)'.format(summary_name)

            distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = '{} (Distributed)'.format(summary_name)

            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(summary_name, path, media_type=mimetype,
                                  on_duplicate='ignore')

        yield bill
Ejemplo n.º 25
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[1]/div/h2').text.split()

        bill_number = long_title[0]
        title = ''
        for x in range(2, len(long_title)):
            title += long_title[x] + ' '
        title = title[0:-1]

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(
                action_node,
                './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(
                action_node,
                './td[2]').text_content()

            if 'Governor' in action:
                actor = 'executive'
            elif 'Speaker' in action:
                actor = 'legislature'
            else:
                actor = 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Were in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[2]/'
            'div[@class="hidden-xs"]/ul[1]/li/a')

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib['href']
            # replace Current w/ session number
            version_url = version_url.replace('Current', session)
            bill.add_version_link(version_name, version_url, media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib['href']
            bill.add_document_link(amendment_name, amendment_url)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            transcript_name = transcript_link.text
            transcript_url = transcript_link.attrib['href']
            bill.add_document_link(transcript_name, transcript_url)

        yield bill

        yield from self.scrape_votes(bill, bill_page, actor)
Ejemplo n.º 26
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[1]/div/h2').text.split()

        bill_number = long_title[0]
        title = ''
        for x in range(2, len(long_title)):
            title += long_title[x] + ' '
        title = title[0:-1]

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(
                action_node,
                './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(
                action_node,
                './td[2]').text_content()

            if 'Governor' in action:
                actor = 'executive'
            elif 'Speaker' in action:
                actor = 'legislature'
            else:
                actor = 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Were in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[2]/'
            'div[@class="hidden-xs"]/ul[1]/li/a')

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib['href']
            # replace Current w/ session number
            version_url = version_url.replace('Current', session)
            bill.add_version_link(version_name, version_url, media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib['href']
            bill.add_document_link(amendment_name, amendment_url)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            transcript_name = transcript_link.text
            transcript_url = transcript_link.attrib['href']
            bill.add_document_link(transcript_name, transcript_url)

        return bill
Ejemplo n.º 27
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf%s' % (session, link)
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning('Bill page not loading for {}; skipping'.format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.extras['summary'] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = ('http://billstatus.ls.state.ms.us/%s/'
                                    'pdf/%s') % (session, main_sponsor_link.strip('../'))
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(main_sponsor,
                                     classification=type,
                                     entity_type='person',
                                     primary=True)

            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                               'pdf/House_authors/%s.xml') % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(leg,
                                         classification=type,
                                         entity_type='person',
                                         primary=False
                                         )
            # Versions
            curr_version = details_root.xpath('string(//CURRENT_OTHER'
                                              ')').replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" \
                        + curr_version
                bill.add_version_link("Current version", curr_version_url,
                                      on_duplicate="ignore",
                                      media_type="text/html"
                                      )

            intro_version = details_root.xpath('string(//INTRO_OTHER)').replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/"\
                        + intro_version
                bill.add_version_link("As Introduced", intro_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            comm_version = details_root.xpath('string(//CMTESUB_OTHER'
                                              ')').replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link("Committee Substitute", comm_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')
            passed_version = details_root.xpath('string(//PASSED_OTHER'
                                                ')').replace("../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                title = "As Passed the " + chamber
                bill.add_version_link(title, passed_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            asg_version = details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link("Approved by the Governor", asg_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            # amendments
            # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
            for amd in details_root.xpath('//AMENDMENTS/*'):
                if amd.tag == 'HAM':
                    name = amd.xpath('HAM_DESC[1]/text()')[0]
                    name = append_parens(amd, 'HAM_DISP', name)
                    name = append_parens(amd, 'HAM_VDESC', name)

                    pdf_url = amd.xpath('string(HAM_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(HAM_OTHER'
                                         ')').replace("../", "")
                elif amd.tag == 'SAM':
                    name = amd.xpath('SAM_DESC[1]/text()')[0]
                    name = append_parens(amd, 'SAM_DISP', name)
                    name = append_parens(amd, 'SAM_VDESC', name)

                    pdf_url = amd.xpath('string(SAM_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(SAM_OTHER'
                                         ')').replace("../", "")
                elif amd.tag == 'AMRPT':
                    name = amd.xpath('AMRPT_DESC[1]/text()')[0]
                    pdf_url = amd.xpath('string(AMRPT_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(AMRPT_OTHER'
                                         ')').replace("../", "")

                pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
                html_url = 'http://billstatus.ls.state.ms.us/' + html_url

                if 'adopted' in name.lower() or 'amendment report' in name.lower():
                    bill.add_version_link(name, pdf_url,
                                          on_duplicate='ignore',
                                          media_type='application/pdf')
                    bill.add_version_link(name, html_url,
                                          on_duplicate='ignore',
                                          media_type='text/html')

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath('//HISTORY/ACTION'):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
                action_desc = action.xpath('string(ACT_DESC)')
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == 'executive':
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(action, self._tz.localize(date),
                                chamber=actor,
                                classification=atype if atype != 'other' else None)

                # use committee names as scraped subjects
                subjects = details_root.xpath('//H_NAME/text()')
                subjects += details_root.xpath('//S_NAME/text()')

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action,
                                                     date, actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Ejemplo n.º 28
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath('//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                                   '', spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber, act_date,
                                               vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')

            bill.add_action(
                action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'),
                classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_fulltext.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_documents.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Ejemplo n.º 29
0
    def scrape_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if '.B. ' in bill_id:
            bill_type = 'bill'
        elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
            bill_type = 'resolution'
        elif '.C.R. ' in bill_id:
            bill_type = 'concurrent resolution'
        elif '.J.R. ' in bill_id:
            bill_type = 'joint resolution'

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            try:
                (title, name) = [
                    x.strip() for x in info.xpath('.//text()') if x.strip()
                ]
            except ValueError:
                self.warning(
                    "Could not find sponsor's name for {}".format(bill_id))
                continue
            assert title == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)
        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(floor_sponsor,
                                 classification='cosponsor',
                                 entity_type='person',
                                 primary=False)
        else:
            raise AssertionError("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            url = version.get('href')
            if not url:
                url = version.xpath('following-sibling::a[1]/@href')[0]

            bill.add_version_link(version.xpath('text()')[0].strip(),
                                  url,
                                  media_type='application/pdf')

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath('@href')[0]
            if '.fn.pdf' in href:
                bill.add_document_link("Fiscal Note",
                                       href,
                                       media_type='application/pdf')
            else:
                text = related.xpath('text()')[0]
                bill.add_document_link(text,
                                       href,
                                       media_type='application/pdf')

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill.subject = subjects

        if page.xpath('//div[@id="billStatus"]//table'):
            status_table = page.xpath('//div[@id="billStatus"]//table')[0]
            yield from self.parse_status(bill, status_table, chamber)

        yield bill
Ejemplo n.º 30
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsorship(name, classification='cosponsor',
                                     entity_type='person', primary=False)
            else:
                bill.add_sponsorship(name, classification='primary',
                                     entity_type='person', primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs['committees']:
                related_entities.append({
                    'type': 'committee',
                    'name': item
                })
            for item in attrs['legislators']:
                related_entities.append({
                    'type': 'legislator',
                    'name': item
                })
            bill.add_action(description=action,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=actor,
                            classification=attrs['classification'],
                            related_entities=related_entities)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            if version_url in version_urls:
                self.warning('Skipping duplicate version URL.')
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE):
                bill.add_document_link(note=name, url=version_url,
                                       media_type='application/pdf')
                continue

            bill.add_version_link(note=name, url=version_url,
                                  media_type='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib['href']))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = (bill.title == "Short Title Not Found.")
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Ejemplo n.º 31
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        '''
        Currently, NYC Legistar does not have conventional "Types" for 
        three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
        We communicated the issue to NYC, and until we learn more, we will
        skip the bills attached to those committees.
        '''
        orgs_without_type = [
            'Charter Revision Commission 2019',
            'New York City Advisory Commission on Property Tax Reform',
            'Democratic Conference of the Council of the City of New York'
        ]
        if matter['MatterBodyName'].strip() in orgs_without_type:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                    u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        return bill
Ejemplo n.º 32
0
    def scrape_bill(self, bill_id):
        old = self.api('bills/' + bill_id + '?')

        # not needed
        old.pop('id')
        old.pop('state')
        old.pop('level', None)
        old.pop('country', None)
        old.pop('created_at')
        old.pop('updated_at')
        old.pop('action_dates')
        old.pop('+bill_type',None)
        old.pop('+subject', None)
        old.pop('+scraped_subjects', None)
        old.pop('subjects', [])

        classification = old.pop('type')

        # ca weirdness
        if 'fiscal committee' in classification:
            classification.remove('fiscal committee')
        if 'urgency' in classification:
            classification.remove('urgency')
        if 'local program' in classification:
            classification.remove('local program')
        if 'tax levy' in classification:
            classification.remove('tax levy')

        if classification[0] in ['miscellaneous', 'jres', 'cres']:
            return

        if classification == ['memorial resolution'] and self.state == 'ar':
            classification = ['memorial']
        if classification == ['concurrent memorial resolution'] and self.state == 'ar':
            classification = ['concurrent memorial']
        if classification == ['joint session resolution'] and self.state == 'il':
            classification = ['joint resolution']
        if classification == ['legislative resolution'] and self.state == 'ny':
            classification = ['resolution']
        if classification == ['address'] and self.state == 'nh':
            classification = ['resolution']

        if not old['title'] and self.state == 'me':
            old['title'] = '(unknown)'

        chamber = old.pop('chamber')
        if self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber in ('joint', 'conference'):
            chamber = 'legislature'

        new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
                   chamber=chamber, classification=classification)

        abstract = old.pop('summary', None)
        if abstract:
            new.add_abstract(abstract, note='')

        for title in old.pop('alternate_titles'):
            new.add_title(title)

        for doc in old.pop('documents'):
            new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

        for doc in old.pop('versions'):
            new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))

        for subj in old.pop('scraped_subjects', []):
            if subj:
                new.add_subject(subj)

        for spon in old.pop('sponsors'):
            if spon.get('committee_id') is not None:
                entity_type = 'organization'
            elif spon.get('leg_id') is not None:
                entity_type = 'person'
            else:
                entity_type = ''
            new.add_sponsorship(spon['name'], spon['type'], entity_type,
                                spon['type'] == 'primary')

        for act in old.pop('actions'):
            actor = act['actor']
            if actor.lower() in ('governor', 'mayor', 'secretary of state'):
                actor = 'executive'
            elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
                actor = 'lower'
            elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
                actor = 'upper'
            elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                           'Office of the Legislative Fiscal Analyst', 'Became Law w',
                           'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
                actor = 'legislature'

            if actor in ('committee', 'sponsor') and self.state == 'pr':
                actor = 'legislature'

            # nebraska & DC
            if actor in ('upper','council') and self.state in ('ne', 'dc'):
                actor = 'legislature'

            if act['action']:
                newact = new.add_action(act['action'], act['date'][:10], chamber=actor,
                                        classification=[action_types[c] for c in act['type'] if c != 'other'])
                for re in act.get('related_entities', []):
                    if re['type'] == 'committee':
                        re['type'] = 'organization'
                    elif re['type'] == 'legislator':
                        re['type'] = 'person'
                    newact.add_related_entity(re['name'], re['type'])

        for comp in old.pop('companions', []):
            if self.state in ('nj', 'ny', 'mn'):
                rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

        for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
            new.add_identifier(abid)


        # generic OpenStates stuff
        for id in old.pop('all_ids'):
            new.add_identifier(id, scheme='openstates')

        for source in old.pop('sources'):
            source.pop('retrieved', None)
            new.add_source(**source)

        ext_title = old.pop('+extended_title', None)
        if ext_title:
            new.add_title(ext_title, note='Extended Title')
        official_title = old.pop('+official_title', None)
        if official_title:
            new.add_title(official_title, note='Official Title')

        to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number', '+referral',
                     '+companion', '+description', '+fiscal_note_probable:',
                     '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                     '+requester', '+transmittal_date:', '+by_request_of', '+bill_draft_number:',
                     '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note', '+impact_clause', '+fiscal_notes',
                     '+short_title', '+type_', '+conference_committee', 'conference_committee',
                     '+companion_bill_ids', '+additional_information']
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # votes
        vote_no = 1
        for vote in old.pop('votes'):
            vote.pop('id')
            vote.pop('state')
            vote.pop('bill_id')
            vote.pop('bill_chamber', None)
            vote.pop('+state', None)
            vote.pop('+country', None)
            vote.pop('+level', None)
            vote.pop('+vacant', None)
            vote.pop('+not_voting', None)
            vote.pop('+amended', None)
            vote.pop('+excused', None)
            vote.pop('+NV', None)
            vote.pop('+AB', None)
            vote.pop('+P', None)
            vote.pop('+V', None)
            vote.pop('+E', None)
            vote.pop('+EXC', None)
            vote.pop('+EMER', None)
            vote.pop('+present', None)
            vote.pop('+absent', None)
            vote.pop('+seconded', None)
            vote.pop('+moved', None)
            vote.pop('+vote_type', None)
            vote.pop('+actual_vote', None)
            vote.pop('+skip_votes', None)
            vote.pop('vote_id')
            vote.pop('+bill_chamber', None)
            vote.pop('+session', None)
            vote.pop('+bill_id', None)
            vote.pop('+bill_session', None)
            vote.pop('committee', None)
            vote.pop('committee_id', None)
            vtype = vote.pop('type', 'passage')

            if vtype == 'veto_override':
                vtype = ['veto-override']
            elif vtype == 'amendment':
                vtype = ['amendment-passage']
            elif vtype == 'other':
                vtype = ''
            else:
                vtype = ['bill-passage']

            # most states need identifiers for uniqueness, just do it everywhere
            identifier = vote['date'] + '-' + str(vote_no)
            vote_no += 1

            chamber = vote.pop('chamber')
            if chamber == 'upper' and self.state in ('ne', 'dc'):
                chamber = 'legislature'
            elif chamber == 'joint':
                chamber = 'legislature'

            newvote = VoteEvent(legislative_session=vote.pop('session'),
                           motion_text=vote.pop('motion'),
                           result='pass' if vote.pop('passed') else 'fail',
                           chamber=chamber,
                           start_date=vote.pop('date'),
                           classification=vtype,
                           bill=new,
                           identifier=identifier)
            for vt in ('yes', 'no', 'other'):
                newvote.set_count(vt, vote.pop(vt + '_count'))
                for name in vote.pop(vt + '_votes'):
                    newvote.vote(vt, name['name'])

            for source in vote.pop('sources'):
                source.pop('retrieved', None)
                newvote.add_source(**source)

            if not newvote.sources:
                newvote.sources = new.sources

            to_extras = ['+record', '+method', 'method', '+filename', 'record', '+action',
                         '+location', '+rcs_num', '+type_', '+threshold', '+other_vote_detail',
                         '+voice_vote']
            for k in to_extras:
                v = vote.pop(k, None)
                if v:
                    newvote.extras[k.replace('+', '')] = v

            assert not vote, vote.keys()
            yield newvote

        assert not old, old.keys()

        yield new
Ejemplo n.º 33
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {
                'B': 'bill',
                'C': 'concurrent resolution',
                'R': 'resolution',
                'N': 'nomination'
            }[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (
                session, link)
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning(
                    'Bill page not loading for {}; skipping'.format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.extras['summary'] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath(
                    'string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = (
                    'http://billstatus.ls.state.ms.us/%s/'
                    'pdf/%s') % (session, main_sponsor_link.strip('../'))
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(main_sponsor,
                                     classification=type,
                                     entity_type='person',
                                     primary=True)

            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                               'pdf/House_authors/%s.xml') % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(leg,
                                         classification=type,
                                         entity_type='person',
                                         primary=False)
            # Versions
            curr_version = details_root.xpath('string(//CURRENT_OTHER'
                                              ')').replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" \
                        + curr_version
                bill.add_version_link("Current version",
                                      curr_version_url,
                                      on_duplicate="ignore",
                                      media_type="text/html")

            intro_version = details_root.xpath(
                'string(//INTRO_OTHER)').replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/"\
                        + intro_version
                bill.add_version_link("As Introduced",
                                      intro_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            comm_version = details_root.xpath('string(//CMTESUB_OTHER'
                                              ')').replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link("Committee Substitute",
                                      comm_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')
            passed_version = details_root.xpath('string(//PASSED_OTHER'
                                                ')').replace(
                                                    "../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                title = "As Passed the " + chamber
                bill.add_version_link(title,
                                      passed_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            asg_version = details_root.xpath('string(//ASG_OTHER)').replace(
                "../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link("Approved by the Governor",
                                      asg_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath('//HISTORY/ACTION'):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath('string(ACT_VOTE)').replace(
                    "../../../..", "")
                action_desc = action.xpath('string(ACT_DESC)')
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == 'executive':
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(
                    action,
                    self._tz.localize(date),
                    chamber=actor,
                    classification=atype if atype is not 'other' else None)

                # use committee names as scraped subjects
                subjects = details_root.xpath('//H_NAME/text()')
                subjects += details_root.xpath('//S_NAME/text()')

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action, date,
                                                     actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Ejemplo n.º 34
0
    def scrape(self):
        for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) :
            leg_type = BILL_TYPES[leg_summary['Type']]
            
            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name":"New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'], 
                           note='created by administrative staff')

            if 'Summary' in leg_details :
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number'] :
                bill.add_identifier(leg_details['Law number'], 
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) :
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor, sponsorship_type,
                                     'person', primary, 
                                     entity_id = make_pseudo_id(name=sponsor))

            
            for attachment in leg_details.get('Attachments', []) :
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history :
                earliest_action = min(self.toTime(action['Date']) 
                                      for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else :
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history :
                action_description = action['Action']
                if not action_description :
                    continue
                    
                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council' :
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration' :
                    responsible_org = 'Mayor'
                   
                if responsible_org == 'Town Hall Meeting' :
                    continue
                else :
                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=action_class)

                if 'url' in action['Action\xa0Details'] :
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral' :
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details['Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(referred_committee,
                                               'organization',
                                               entity_id = make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes :
                        action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action_description,
                                           organization={'name': responsible_org},
                                           classification=action_class,
                                           start_date=action_date,
                                           result=result,
                                           bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes :
                            action_vote.vote(option, voter)


                        yield action_vote
            
            text = self.text(leg_summary['url'])

            if text :
                bill.extras = {'local_classification' : leg_summary['Type'],
                               'full_text' : text}
            else :
                bill.extras = {'local_classification' : leg_summary['Type']}

            yield bill
Ejemplo n.º 35
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        last_action = self.parse_bill_field(page,
                                            "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

        version = self.parse_bill_field(page, "Bill Documents")
        source_url = version.xpath("a[1]/@href")[0]
        version_title = version.xpath("a[1]/text()")[0].strip()

        if version is None:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return
        else:
            if source_url.endswith(".doc"):
                mimetype = "application/msword"
            elif source_url.endswith(".pdf"):
                mimetype = "application/pdf"

        title = self.parse_bill_field(page, "Title").text_content()

        # actions = self.get_nodes(
        #     page,
        #     '//div[@class="StandardText leftDivMargin"]/'
        #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link(version_title, source_url, media_type=mimetype)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            if source_url.endswith(".doc"):
                mimetype = "application/msword"
            elif source_url.endswith(".pdf"):
                mimetype = "application/pdf"

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Ejemplo n.º 36
0
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution', 9: 'petition'}
        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
                             'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=bill_type)

                bill.subject = list(set(self.subject_mapping[bill_id]))
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                text_urls = billtext.xpath("./a")
                for text_url in text_urls:
                    version_name = text_url.text.strip()
                    version_url = text_url.attrib['href']
                    bill.add_version_link(note=version_name, url=version_url,
                                          media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(classification='primary',
                                         name=leg, entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(classification='cosponsor',
                                         name=leg, entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document_link(note=minutes_date, url=minutes_url)
                    minutes_count += 1

                self.scrape_actions(root, bill, "lower")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Ejemplo n.º 37
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(
            page,
            "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def sbp(x): return self.scrape_bare_page(page.xpath(
            "//a[contains(text(), '%s')]" % (x))[0].attrib['href'])

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        try:
            amendments = sbp("Amendments")
        except IndexError:
            amendments = []

        title = page.xpath(
            "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        actions = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']/"
            "/table[@style='font-size:small']/tr")

        bill_id = page.xpath(
            "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = self._bill_types[bill_abbreviation[1:]]
        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(bill_url)

        authors.remove(author)
        bill.add_sponsorship(author,
                             classification='primary',
                             entity_type='person',
                             primary=True)
        for author in authors:
            bill.add_sponsorship(author,
                                 classification='cosponsor',
                                 entity_type='person',
                                 primary=False)

        for digest in digests:
            bill.add_document_link(note=digest.text,
                                   url=digest.attrib['href'],
                                   media_type="application/pdf")

        for version in versions:
            bill.add_version_link(note=version.text,
                                  url=version.attrib['href'],
                                  media_type="application/pdf")

        for amendment in amendments:
            bill.add_version_link(note=amendment.text,
                                  url=amendment.attrib['href'],
                                  media_type="application/pdf")

        flags = {
            "prefiled": ["filing"],
            "referred to the committee": ["referral-committee"],
            "sent to the house": ['passage'],
            "ordered returned to the house": ['passage'],
            "ordered to the senate": ['passage'],
            "signed by the governor": ['executive-signature'],
            "sent to the governor": ['executive-receipt'],
        }

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            yield from self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            session_year = self.jurisdiction.legislative_sessions[-1]['start_date'][0:4]
            # Session is April -> June. Prefiles look like they're in
            # January at earliest.
            date += '/{}'.format(session_year)
            date = dt.datetime.strptime(date, '%m/%d/%Y')
            chamber = self._chambers[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            bill.add_action(description=text,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=chamber,
                            classification=cat)

        yield bill
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                             'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count += 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=bill_type
                            )
                bill.subject = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version_link(note="Bill Text",
                                              url=text_url,
                                              media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(name=leg,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(name=leg,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                    # bill.add_document(minutes_date, minutes_url)
                    bill.add_document_link(note=minutes_date,
                                           url=minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Ejemplo n.º 39
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            # Do not scrape private bills introduced before this timestamp.
            if self._is_restricted(matter) and (
                    date < self.START_DATE_PRIVATE_SCRAPE):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': self._is_restricted(matter)}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if self._is_restricted(matter):
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        try:
                            raw_option = vote['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 40
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "http://legislature.idaho.gov/legislation/%s/" % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u"\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Ejemplo n.º 41
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
        if not title:
            self.warning('blank bill on %s - skipping', url)
            return

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsorship(name, classification='cosponsor',
                                     entity_type='person', primary=False)
            else:
                bill.add_sponsorship(name, classification='primary',
                                     entity_type='person', primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs['committees']:
                related_entities.append({
                    'type': 'committee',
                    'name': item
                })
            for item in attrs['legislators']:
                related_entities.append({
                    'type': 'legislator',
                    'name': item
                })
            bill.add_action(description=action,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=actor,
                            classification=attrs['classification'],
                            related_entities=related_entities)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            if version_url in version_urls:
                self.warning('Skipping duplicate version URL.')
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE):
                bill.add_document_link(note=name, url=version_url,
                                       media_type='application/pdf')
                continue

            bill.add_version_link(note=name, url=version_url,
                                  media_type='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib['href']))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = (bill.title == "Short Title Not Found.")
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Ejemplo n.º 42
0
    def parse_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        last_action = self.parse_bill_field(page,
                                            'Last Action').xpath('text()')[0]
        if 'WITHDRAWN' in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

        version = self.parse_bill_field(page, 'Bill Documents')
        source_url = version.xpath('a[1]/@href')[0]
        version_title = version.xpath('a[1]/text()')[0].strip()

        if version is None:
            # Bill withdrawn
            self.logger.warning('Bill withdrawn.')
            return
        else:
            if source_url.endswith('.doc'):
                mimetype = 'application/msword'
            elif source_url.endswith('.pdf'):
                mimetype = 'application/pdf'

        title = self.parse_bill_field(page, 'Title').text_content()

        # actions = self.get_nodes(
        #     page,
        #     '//div[@class="StandardText leftDivMargin"]/'
        #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

        if 'CR' in bill_id:
            bill_type = 'concurrent resolution'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link(version_title, source_url, media_type=mimetype)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib['href']
            if source_url.endswith('.doc'):
                mimetype = 'application/msword'
            elif source_url.endswith('.pdf'):
                mimetype = 'application/pdf'

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(link.text.strip(),
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        bdr_no = self.parse_bill_field(page, 'Bill Request Number')
        if bdr_no.xpath('text()'):
            bdr = bdr_no.xpath('text()')[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Ejemplo n.º 43
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (
            page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor, classification="primary", entity_type="person", primary=True
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link(
            "Current Version", btext.get("href"), media_type="application/pdf"
        )

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link("Amendment " + amendment.text, amendment.get("href"))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
                .text_content()
                .split("by")[-1]
            )
            secondary_sponsor = (
                secondary_sponsor.replace("*", "").replace(")", "").strip()
            )
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Ejemplo n.º 44
0
def test_full_bill():
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee',
                             classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99',
                         '1899',
                         'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=org._id)

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee',
                          '1900-04-04',
                          chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith',
                         classification='extra sponsor',
                         entity_type='person',
                         primary=False,
                         entity_id=sp._id)
    bill.add_sponsorship('Jane Smith',
                         classification='lead sponsor',
                         entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official",
                      date='1969-10-20')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note',
                          'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi,
                 pi).import_data([oldbill.as_dict(),
                                  bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Ejemplo n.º 45
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] else 'upper'

        bill = Bill(
            identifier=bill_json['bill'],
            legislative_session=session,
            title=bill_json['catchTitle'],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if action_json['location'] and action_json[
                    'location'] in chamber_map:
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']
            }

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(
                    note=veto['vetoLinkText'],
                    url=url,
                    media_type="application/pdf"  # optional but useful!
                )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = 'person' if sponsor[
                'sponsorTitle'] else 'organization'
            bill.add_sponsorship(name=sponsor['name'],
                                 classification=status,
                                 entity_type=sponsor_type,
                                 primary=sponsor['primarySponsor'])

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                             '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
    def scrape_events_range(self, start_date, end_date):
        def daterange(start_date, end_date):
            number_of_days = int((end_date - start_date).days)
            for n in range(number_of_days):
                yield start_date + dt.timedelta(n)

        for date in daterange(start_date, end_date):
            calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
            events = self.extract_events_by_url(calendar_day_url)
            for event in events:
                tz = pytz.timezone("America/Toronto")
                time = dt.datetime.strptime(event["time"], "%I:%M %p")
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                org_name = event["meeting"]
                e = Event(
                    name=org_name,
                    start_time=start,
                    timezone=tz.zone,
                    location_name=event["location"],
                    status=STATUS_DICT.get(event["meeting_status"]),
                )
                e.extras = {"meeting_number": event["no"], "tmmis_meeting_id": event["meeting_id"]}
                e.add_source(calendar_day_url)
                e.add_participant(name=org_name, type="organization")

                def is_agenda_available(event):
                    return event["publishing_status"] in ["Agenda Published", "Minutes Published"]

                def is_council(event):
                    return True if event["meeting"] == self.jurisdiction.name else False

                if is_agenda_available(event):
                    agenda_url_template = (
                        AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                    )
                    agenda_url = agenda_url_template.format(event["meeting_id"])
                    full_identifiers = list(self.full_identifiers(event["meeting_id"], is_council(event)))

                    event_map_url_template = (
                        "http://app.toronto.ca/tmmis/getAddressList.do?function=getMeetingAddressList&meetingId={}"
                    )
                    event_map_url = event_map_url_template.format(event["meeting_id"])
                    addresses_d = self.addressesByAgendaId(event_map_url)

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item["title"])
                        a.add_classification(item["type"].lower())
                        a["order"] = str(i)

                        def normalize_wards(raw):
                            if not raw:
                                raw = "All"
                            if raw == "All":
                                return raw.lower()
                            else:
                                return raw.split(", ")

                        wards = normalize_wards(item["wards"])
                        identifier_regex = re.compile(r"^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$")
                        [full_identifier] = [
                            id for id in full_identifiers if identifier_regex.match(id).group(1) == item["identifier"]
                        ]
                        a.add_bill(full_identifier)
                        if full_identifier not in self.seen_agenda_items:
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session="2014-2018",
                                identifier=full_identifier,
                                title=item["title"],
                                from_organization={"name": self.jurisdiction.name},
                            )
                            b.add_source(agenda_url)
                            b.add_document_link(
                                note="canonical",
                                media_type="text/html",
                                url=AGENDA_ITEM_TEMPLATE.format(full_identifier),
                            )
                            b.extras["wards"] = wards

                            addresses = addresses_d.get(full_identifier)
                            if addresses:
                                b.extras["locations"] = []
                                for address in addresses:
                                    location = {"address": {"full_address": address}}
                                    b.extras["locations"].append(location)

                            self.seen_agenda_items.append(full_identifier)

                            yield b

                yield e
Ejemplo n.º 47
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])"
        ).strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(
                    name, classification="primary", entity_type="person", primary=True
                )

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
                bill.add_document_link(
                    note=name, url=version_url, media_type="application/pdf"
                )
                continue

            bill.add_version_link(
                note=name, url=version_url, media_type="application/pdf"
            )

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"]))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = bill.title == "Short Title Not Found."
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Ejemplo n.º 48
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.
        
        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # If this Boolean field is True, then do not scrape the Bill.
            # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
            # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
            if matter['MatterRestrictViewViaWeb']:
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']

            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Ejemplo n.º 49
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url, verify=False).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title,
                    classification=bill_type)
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(' ', '')]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, 'short title')

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(note=name, url=href, media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    bill.add_sponsorship(name=sponsors.strip(), entity_type="organization",
                                         primary=True, classification='primary')
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(classification='primary', name=person,
                                                 entity_type="person", primary=True)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y").strftime('%Y-%m-%d')
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        yield bill
Ejemplo n.º 50
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        
        '''
        Currently, NYC Legistar does not have conventional "Types" for 
        three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
        We communicated the issue to NYC, and until we learn more, we will
        skip the bills attached to those committees.
        '''
        orgs_without_type = ['Charter Revision Commission 2019',
                             'New York City Advisory Commission on Property Tax Reform',
                             'Democratic Conference of the Council of the City of New York']
        if matter['MatterBodyName'].strip() in orgs_without_type:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id) :
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        return bill
Ejemplo n.º 51
0
    def scrape(self):
        for leg_summary in self.legislation(
                created_after=datetime.datetime(2014, 1, 1)):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number']:
                bill.add_identifier(leg_details['Law number'],
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor,
                                     sponsorship_type,
                                     'person',
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor))

            for attachment in leg_details.get('Attachments', []):
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history:
                earliest_action = min(
                    self.toTime(action['Date']) for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else:
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                if responsible_org == 'Town Hall Meeting':
                    continue
                else:
                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

            text = self.text(leg_summary['url'])

            if text:
                bill.extras = {
                    'local_classification': leg_summary['Type'],
                    'full_text': text
                }
            else:
                bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
Ejemplo n.º 52
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
                session[-4:], bill_id.replace(' ', '-'))
            html = self.get(url).text
            if ('Page Not Found' in html or
                    'The bill you are looking for is not available yet' in html):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute('http://legislature.mi.gov')

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u'\xa0', ' ')
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    'primary'
                    if sponsor.tail and 'primary' in sponsor.tail
                    else 'cosponsor'
                )
            else:
                classification = 'primary'
            bill.add_sponsorship(
                name=name,
                chamber=chamber,
                entity_type='person',
                primary=classification == 'primary',
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
            if submatch and tds[2].xpath('a'):
                version_url = tds[2].xpath('a/@href')[0]
                version_name = tds[2].xpath('a/text()')[0].strip()
                version_name = 'Substitute {}'.format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif version_url.lower().endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    results = self.parse_roll_call(vote_url, rc_num)

                    if results is not None:
                        vote_passed = len(results['yes']) > len(results['no'])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result='pass' if vote_passed else 'fail',
                            classification='passage',
                        )

                        # check the expected counts vs actual
                        count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['yes']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['yes'])))
                        count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['no']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['no'])))

                        vote.set_count('yes', len(results['yes']))
                        vote.set_count('no', len(results['no']))
                        vote.set_count('other', len(results['other']))

                        for name in results['yes']:
                            vote.yes(name)
                        for name in results['no']:
                            vote.no(name)
                        for name in results['other']:
                            vote.vote('other', name)

                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif url.endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill