Beispiel #1
0
    def scrape_current(self, chamber, term):
        """Scrape the current bills of one chamber from the ``ksapi`` feed.

        Requests ``ksapi.url + 'bill_status/'``, decodes the JSON body and
        walks its ``'content'`` list.  Bills whose ``BILLNO`` does not start
        with the chamber's initial ("S" for Senate, "H" for House) are
        skipped.  For every remaining bill a ``Bill`` is built with its
        titles, sponsors and actions, handed to ``self.scrape_html`` and then
        saved with ``self.save_bill``.

        chamber -- 'upper' selects the Senate, any other value the House
        term    -- passed through unchanged as the first ``Bill`` argument
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                # Order matters: 'CR' must be tested before the bare 'R'.
                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # Without this branch btype would be unbound on the first
                    # bill, or silently carry over the previous bill's type.
                    self.warning('unrecognized bill id format: %s' % bill_id)
                    btype = 'bill'

                # main
                bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                            type=btype, status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                if bill_data['LONGTITLE']:
                    bill.add_title(bill_data['LONGTITLE'])

                # A lone sponsor is the primary one; when several names are
                # listed they are all recorded as cosponsors.
                sponsor_names = bill_data['SPONSOR_NAMES']
                stype = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
                for sponsor in sponsor_names:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper' if event['chamber'] == 'Senate'
                             else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")

                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    # Codes missing from ksapi.action_codes are logged and
                    # categorised as 'other' rather than aborting the scrape.
                    if event['action_code'] not in ksapi.action_codes:
                        self.warning('unknown action code on %s: %s %s' %
                                     (bill_id, event['action_code'],
                                      event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Beispiel #2
0
    def process_bill(self, data):
        """Build a ``Bill`` from the bill dictionary ``data`` and save it.

        Copies the summary, extras, actions (with their related committees
        and legislators), sources, sponsors, versions, documents, alternate
        titles and related bills from ``data`` onto the new ``Bill`` and then
        calls ``self.save_bill``.
        """
        origin = parse_psuedo_id(data['from_organization'])
        chamber = origin['classification']
        # The 'legislature' classification is stored as 'upper'.
        chamber = 'upper' if chamber == 'legislature' else chamber

        bill = Bill(
            data['legislative_session'],
            chamber,
            data['identifier'],
            data['title'],
            subjects=data['subject'],
            type=data['classification']
        )

        abstracts = data['abstracts']
        if abstracts:
            # Only the first abstract is kept as the summary.
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(act['organization_id'])['classification']
            # Sort related entity names by entity type; other types are
            # ignored.
            related = {'organization': [], 'person': []}
            for entity in act['related_entities']:
                names = related.get(entity['entity_type'])
                if names is not None:
                    names.append(entity['name'])
            bill.add_action(
                actor,
                act['description'],
                parse_date(act['date']),
                type=_action_categories(act['classification']),
                committees=related['organization'],
                legislators=related['person'],
                **act.get('extras', {})
            )

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # One add_version call per link of each version.
        for ver in data['versions']:
            for ver_link in ver['links']:
                bill.add_version(
                    ver['note'],
                    ver_link['url'],
                    mimetype=ver_link['media_type'],
                    date=parse_date(ver['date']),
                    **ver.get('extras', {})
                )

        # One add_document call per link of each document.
        for document in data['documents']:
            for doc_link in document['links']:
                bill.add_document(
                    document['note'],
                    doc_link['url'],
                    mimetype=doc_link['media_type'],
                    date=parse_date(document['date']),
                    **document.get('extras', {})
                )

        for alt in data['other_titles']:
            bill.add_title(alt['title'])

        # Companions are registered under this bill's own chamber.
        for companion in data['related_bills']:
            bill.add_companion(
                companion['identifier'],
                companion['legislative_session'],
                chamber
            )

        self.save_bill(bill)
Beispiel #3
0
    def scrape_current(self, chamber, term):
        """Scrape the current bills that have history in ``chamber``.

        Requests ``ksapi.url + "bill_status/"``, decodes the JSON body and
        walks its ``"content"`` list.  A bill is kept when at least one of
        its ``HISTORY`` entries names this chamber ("Senate" for 'upper',
        "House" otherwise).  Each kept bill gets its source, long title,
        "apn" document/version, sponsors, one action per history event and,
        for events whose action code is in ``ksapi.voted`` and whose status
        matches ``votes_re``, a ``Vote``.  The bill is then saved.

        chamber -- 'upper' selects the Senate, any other value the House
        term    -- passed through unchanged as the first ``Bill`` argument
        """
        chamber_name = "Senate" if chamber == "upper" else "House"
        # perhaps we should save this data so we can make one request for
        # both chambers?
        with self.urlopen(ksapi.url + "bill_status/") as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:
                # filtering out other chambers; evaluated afresh for every
                # bill so one match cannot leak into the following bills
                bill_is_in_chamber = any(
                    history["chamber"] == chamber_name
                    for history in bill_data["HISTORY"]
                )
                if not bill_is_in_chamber:
                    continue

                # main
                bill_no = bill_data["BILLNO"]
                bill = Bill(term, chamber, bill_no, bill_data["SHORTTITLE"])
                bill.add_source(ksapi.url + "bill_status/" + bill_no.lower())
                if bill_data["LONGTITLE"]:
                    bill.add_title(bill_data["LONGTITLE"])
                bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
                bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

                # A lone sponsor is the primary one; when several names are
                # listed they are all recorded as cosponsors.
                sponsor_names = bill_data["SPONSOR_NAMES"]
                sponsor_type = "primary" if len(sponsor_names) == 1 else "cosponsor"
                for sponsor in sponsor_names:
                    bill.add_sponsor(sponsor_type, sponsor)

                for event in bill_data["HISTORY"]:
                    # The acting body is the committees and/or conferees
                    # named on the event itself; these keys live on the
                    # event, not on the bill record.
                    acting_names = []
                    for key in ("committee_names", "conferee_names"):
                        acting_names.extend(event.get(key) or [])

                    if acting_names:
                        actor = " and ".join(acting_names)
                    else:
                        # No committee/conferee listed: fall back to the
                        # chamber the event took place in.
                        actor = "upper" if event["chamber"] == "Senate" else "lower"

                    date = datetime.datetime.strptime(
                        event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S"
                    )
                    bill.add_action(actor, event["status"], date)

                    if event["action_code"] in ksapi.voted:
                        votes = votes_re.match(event["status"])
                        if votes:
                            # Group 1 is used as the motion text, groups 2
                            # and 3 as the yes/no counts; the "other" count
                            # is always recorded as 0.
                            vote = Vote(
                                chamber,
                                date,
                                votes.group(1),
                                event["action_code"] in ksapi.passed,
                                int(votes.group(2)),
                                int(votes.group(3)),
                                0,
                            )
                            vote.add_source(ksapi.ksleg + "bill_status/" + bill_no.lower())
                            bill.add_vote(vote)

                self.save_bill(bill)
Beispiel #4
0
    def process_bill(self, data):
        """Build a ``Bill`` from the bill dictionary ``data`` and save it.

        Copies the summary, extras, actions, sources, sponsors, versions,
        documents and alternate titles from ``data`` onto the new ``Bill``
        and then calls ``self.save_bill``.  Related entities on actions and
        related bills are not converted yet (see the TODOs below).
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        bill = Bill(data['legislative_session'],
                    chamber,
                    data['identifier'],
                    data['title'],
                    subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            # Only the first abstract is kept as the summary.
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(
                action['organization_id'])['classification']
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']))
            # TODO: related entities

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(
                sponsor['classification'],
                sponsor['name'],
            )

        # One add_version call per link of each version.
        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'],
                                 link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']))

        # One add_document call per link of each document.
        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'],
                                  link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']))

        for title in data['other_titles']:
            # NOTE(review): entries are assumed to be mappings carrying the
            # text under 'title' -- confirm against the producer.  Only the
            # text is handed to add_title; plain strings pass through as-is.
            if isinstance(title, dict):
                title = title['title']
            bill.add_title(title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
Beispiel #5
0
    def process_bill(self, data):
        """Build a ``Bill`` from the bill dictionary ``data`` and save it.

        Copies the summary, extras, actions, sources, sponsors, versions,
        documents and alternate titles from ``data`` onto the new ``Bill``
        and then calls ``self.save_bill``.  Related entities on actions and
        related bills are not converted yet (see the TODOs below).
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])
        if data['abstracts']:
            # Only the first abstract is kept as the summary.
            bill['summary'] = data['abstracts'][0]['abstract']
        bill.update(**data['extras'])

        for action in data['actions']:
            actor = parse_psuedo_id(action['organization_id'])['classification']
            bill.add_action(actor,
                            action['description'],
                            parse_date(action['date']),
                            type=_action_categories(action['classification']))
            # TODO: related entities

        for source in data['sources']:
            bill.add_source(source['url'])

        for sponsor in data['sponsorships']:
            bill.add_sponsor(sponsor['classification'],
                             sponsor['name'],
                             )

        # One add_version call per link of each version.
        for version in data['versions']:
            for link in version['links']:
                bill.add_version(version['note'], link['url'],
                                 mimetype=link['media_type'],
                                 date=parse_date(version['date']))

        # One add_document call per link of each document.
        for doc in data['documents']:
            for link in doc['links']:
                bill.add_document(doc['note'], link['url'],
                                  mimetype=link['media_type'],
                                  date=parse_date(doc['date']))

        for title in data['other_titles']:
            # NOTE(review): entries are assumed to be mappings carrying the
            # text under 'title' -- confirm against the producer.  Only the
            # text is handed to add_title; plain strings pass through as-is.
            if isinstance(title, dict):
                title = title['title']
            bill.add_title(title)

        # TODO: related bills
        # for related in data['related_bills']:

        self.save_bill(bill)
Beispiel #6
0
    def scrape(self, session, chambers):
        """
        Scrape every bill of one council period from the LIMS JSON endpoints.

        Posts a search to GetPublicAdvancedSearch and walks the result pages
        (advancing "iDisplayStart" by per_page) until a page comes back with
        no rows. For each row whose id does not start with "AG", posts to
        GetPublicData for the detail record, builds a Bill filed under the
        "upper" chamber, attaches sponsors, alternate titles, actions,
        documents and votes, and saves it.

        session  -- sent as "CouncilPeriod" in the search criteria and used
                    to index the result of get_member_ids()
        chambers -- not referenced anywhere in this method
        """
        #get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10 #seems like it gives me 10 no matter what.
        start_record = 0

        headers = {"Content-Type":"application/json"}
        url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
        bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
        # search payload; only "iDisplayStart" is changed between pages
        params = {"request":{"sEcho":2,"iColumns":4,"sColumns":"","iDisplayStart":0,"iDisplayLength":per_page,"mDataProp_0":"ShortTitle","mDataProp_1":"Title","mDataProp_2":"LegislationCategories","mDataProp_3":"Modified","iSortCol_0":0,"sSortDir_0":"asc","iSortingCols":0,"bSortable_0":"true","bSortable_1":"true","bSortable_2":"true","bSortable_3":"true"},"criteria":{"Keyword":"","Category":"","SubCategoryId":"","RequestOf":"","CouncilPeriod":str(session),"Introducer":"","CoSponsor":"","CommitteeReferral":"","CommitteeReferralComments":"","StartDate":"","EndDate":"","QueryLimit":100,"FilterType":"","Phases":"","LegislationStatus":"0","IncludeDocumentSearch":"false"}}
        param_json = json.dumps(params)
        response = self.post(url,headers=headers,data=param_json)
        #the response is a terrible string-of-nested-json-strings. Yuck.
        response = self.decode_json(response.json()["d"])
        data = response["aaData"]

        # module-level list, reset for every bill below; presumably read by
        # add_documents -- TODO confirm
        global bill_versions

        # an empty page ends the scrape
        while len(data) > 0:

            for bill in data:
                bill_versions = [] #sometimes they're in there more than once, so we'll keep track

                bill_id = bill["Title"]
                if bill_id.startswith("AG"):
                    #actually an agenda, skip
                    continue
                bill_params = {"legislationId":bill_id}
                bill_info = self.post(bill_url,headers=headers,data=json.dumps(bill_params))
                bill_info = self.decode_json(bill_info.json()["d"])["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/"+bill_id


                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]



                if bill_id.startswith("R") or bill_id.startswith("CER"):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                #dc has no chambers. calling it all upper
                # NOTE(review): this rebinds the loop variable `bill` (the
                # search row) to the Bill object; the row is not used again
                # after this point.
                bill = Bill(session,"upper", bill_id, title, type=bill_type)

                #sponsors and cosponsors
                introducers = legislation_info["Introducer"]
                try:
                    #sometimes there are cosponsors, sometimes not.
                    cosponsors = legislation_info["CoSponsor"]
                except KeyError:
                    cosponsors = []
                for i in introducers:
                    sponsor_name = i["Name"]
                    #they messed up Phil Mendelson's name
                    if sponsor_name == "Phil Pmendelson":
                        sponsor_name = "Phil Mendelson"
                    bill.add_sponsor(name=sponsor_name,type="primary")
                for s in cosponsors:
                    sponsor_name = s["Name"]
                    if sponsor_name == "Phil Pmendelson":
                        sponsor_name = "Phil Mendelson"
                    bill.add_sponsor(name=sponsor_name,type="cosponsor")


                #if it's become law, add the law number as an alternate title
                if "LawNumber" in legislation_info:
                    law_num = legislation_info["LawNumber"]
                    if law_num:
                        bill.add_title(law_num)

                #also sometimes it's got an act number
                if "ActNumber" in legislation_info:
                    act_num = legislation_info["ActNumber"]
                    if act_num:
                        bill.add_title(act_num)

                #sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        # drop the word "previously" and all spaces, then
                        # upper-case what is left and store it as a title
                        prev_title = add_info.lower().replace("previously","").strip().replace(" ","")
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill["additional_information"] = add_info

                # NOTE(review): the key tested is "WithDrawnDate" while the
                # key read below is "WithdrawnBy" (different capitalisation)
                # -- confirm both spellings against the API.
                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                    if withdrawn_by == "the Mayor":

                        bill.add_action("executive",
                                    "withdrawn",
                                    withdrawn_date,
                                    "bill:withdrawn")

                    elif "committee" in withdrawn_by.lower():
                        # NOTE(review): committees/legislators receive a
                        # single string here, whereas the referral actions
                        # further down pass a list -- confirm add_action
                        # accepts both.
                        bill.add_action("upper",
                                    "withdrawn",
                                    withdrawn_date,
                                    "bill:withdrawn",
                                    committees=withdrawn_by)
                    else:
                        bill.add_action("upper",
                                    "withdrawn",
                                    withdrawn_date,
                                    "bill:withdrawn",
                                    legislators=withdrawn_by)


                #deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor != []:
                    # only the first MayorReview entry is used
                    mayor = mayor[0]

                    #in dc, mayor == governor because openstates schema
                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(mayor["TransmittedDate"])

                        bill.add_action("executive",
                                    "transmitted to mayor",
                                    transmitted_date,
                                    type = "governor:received")

                    if 'SignedDate' in mayor:
                        signed_date = self.date_format(mayor["SignedDate"])

                        bill.add_action("executive",
                                        "signed",
                                        signed_date,
                                        type="governor:signed")


                    elif 'ReturnedDate' in mayor: #if returned but not signed, it was vetoed
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action("executive",
                                        "vetoed",
                                        veto_date,
                                        type="governor:vetoed")

                        if 'EnactedDate' in mayor: #if it was returned and enacted but not signed, there was a veto override
                            override_date = self.date_format(mayor["EnactedDate"])

                            bill.add_action("upper",
                                        "veto override",
                                        override_date,
                                        type="bill:veto_override:passed")

                    if 'AttachmentPath' in mayor:
                        #documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"],bill)

                congress = bill_info["CongressReview"]
                if len(congress) > 0:
                    # only the first CongressReview entry is used
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(congress["TransmittedDate"])

                        bill.add_action("other",
                                    "Transmitted to Congress for review",
                                    transmitted_date)




                #deal with committee actions
                # the committee actions below are dated with "DateRead" when
                # present, otherwise with "IntroductionDate"
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    # NOTE(review): this `continue` leaves the per-bill loop
                    # before add_source/save_bill, so the bill built so far
                    # is not saved.
                    self.logger.warning("Crap, we can't find anything that looks like an action date. Skipping")
                    continue
                date = self.date_format(date)
                if "CommitteeReferral" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferral"]:
                        # a "retained by the council" entry cancels the whole
                        # referral, including names collected before it
                        if committee["Name"].lower() == "retained by the council":
                            committees = []
                            break
                        else:
                            committees.append(committee["Name"])
                    if committees != []:
                        bill.add_action("upper",
                                    "referred to committee",
                                    date,
                                    committees=committees,
                                    type="committee:referred")

                if "CommitteeReferralComments" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferralComments"]:
                        committees.append(committee["Name"])
                    # added whenever the key is present, even if the list of
                    # committees ends up empty
                    bill.add_action("upper",
                                    "comments from committee",
                                    date,
                                    committees=committees,
                                    type="other")

                #deal with random docs floating around
                docs = bill_info["OtherDocuments"]
                for d in docs:
                    if "AttachmentPath" in d:
                        self.add_documents(d["AttachmentPath"],bill)
                    else:
                        self.logger.warning("Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"],bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],bill)


                #full council votes
                votes = bill_info["VotingSummary"]
                for vote in votes:
                    self.process_vote(vote, bill, member_ids)


                #deal with committee votes
                if "CommitteeMarkup" in bill_info:
                    committee_info = bill_info["CommitteeMarkup"]
                    if len(committee_info) > 0:
                        for committee_action in committee_info:
                            self.process_committee_vote(committee_action,bill)
                        # NOTE(review): committee_info is the sequence just
                        # iterated, so this tests whether the string itself
                        # is one of its elements. The body reads `vote`,
                        # which is left over from the council-vote loop
                        # above (unbound if that loop never ran), and
                        # `is_version`, which is not assigned anywhere in
                        # this method. Presumably committee_action was meant
                        # -- confirm before relying on this branch.
                        if "AttachmentPath" in committee_info:
                            self.add_documents(vote["AttachmentPath"],bill,is_version)

                bill.add_source(bill_source_url)
                self.save_bill(bill)

            #get next page
            start_record += per_page
            params["request"]["iDisplayStart"] = start_record
            param_json = json.dumps(params)
            response = self.post(url,headers=headers,data=param_json)
            response = self.decode_json(response.json()["d"])
            data = response["aaData"]
Beispiel #7
0
    def process_bill(self, data):
        """Build a ``Bill`` from the bill dictionary ``data`` and save it.

        Copies the summary, extras, actions (with their related committees
        and legislators), sources, sponsors, versions, documents, alternate
        titles, related bills and alternate identifiers from ``data`` onto
        the new ``Bill`` and then calls ``self.save_bill``.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        # The 'legislature' classification is stored as 'upper'.
        if chamber == 'legislature':
            chamber = 'upper'

        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])

        if data['abstracts']:
            # Only the first abstract is kept as the summary.
            first_abstract = data['abstracts'][0]
            bill['summary'] = first_abstract['abstract']
        bill.update(**data['extras'])

        for entry in data['actions']:
            organization = parse_psuedo_id(entry['organization_id'])
            actor = organization['classification']

            # Split the related entity names by entity type; entities of
            # any other type are left out.
            committee_names = []
            legislator_names = []
            for entity in entry['related_entities']:
                kind = entity['entity_type']
                if kind == 'organization':
                    committee_names.append(entity['name'])
                elif kind == 'person':
                    legislator_names.append(entity['name'])

            bill.add_action(
                actor, entry['description'], parse_date(entry['date']),
                type=_action_categories(entry['classification']),
                committees=committee_names, legislators=legislator_names,
                **entry.get('extras', {})
            )

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # One add_version call per link of each version.
        for ver in data['versions']:
            for ver_link in ver['links']:
                bill.add_version(
                    ver['note'], ver_link['url'],
                    mimetype=ver_link['media_type'],
                    date=parse_date(ver['date']),
                    **ver.get('extras', {})
                )

        # One add_document call per link of each document.
        for document in data['documents']:
            for doc_link in document['links']:
                bill.add_document(
                    document['note'], doc_link['url'],
                    mimetype=doc_link['media_type'],
                    date=parse_date(document['date']),
                    **document.get('extras', {})
                )

        for alt in data['other_titles']:
            bill.add_title(alt['title'])

        # Companions are registered under this bill's own chamber.
        for companion in data['related_bills']:
            bill.add_companion(companion['identifier'],
                               companion['legislative_session'],
                               chamber)

        alternate_ids = []
        for other in data['other_identifiers']:
            alternate_ids.append(other['identifier'])
        bill['alternate_bill_ids'] = alternate_ids

        self.save_bill(bill)
Beispiel #8
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        Fetches the bill page built from ``BILL_URL``, reads the
        "bill-table" tables it contains (index 0: sponsor text, index 1:
        title, index 2: action rows) and saves the resulting ``Bill``.

        chamber     -- chamber recorded on the bill and used as the first
                       actor until an action names another chamber
        session     -- session identifier; its first four characters are
                       used as the year of every action date
        bill_id     -- bill identifier; spaces are removed for the URL and
                       for the subject lookup
        short_title -- optional alternate title, added when it differs
                       (ignoring case) from the title found on the page
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

        if short_title and bill['title'].lower() != short_title.lower():
            bill.add_title(short_title)

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version(name, href, mimetype='application/pdf')
            else:
                bill.add_document(name, href)

        def _split(string):
            # Split a sponsor list on commas and on the standalone word
            # AND/and.  Only the delimiter is consumed, so every name
            # survives; the word boundaries keep names such as "Anderson"
            # or "BRAND" intact.  Empty pieces are filtered by the caller.
            return re.split(r"\s*(?:,|\b(?:AND|and)\b)\s*", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    # a committee sponsor is stored as one entry, unsplit
                    bill.add_sponsor('primary', sponsors.strip())
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsor('primary', person)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # rows without a date reuse the date of the last dated row
            if date:
                last_date = date
            else:
                date = last_date

            # the page gives month/day only; the year comes from the session
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y")
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                vote = self.parse_vote(actor, date, row[2])
                vote.add_source(url)
                bill.add_vote(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(actor, action, date, type=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        self.save_bill(bill)
Beispiel #9
0
    def scrape(self, session, chambers):
        #get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10  #seems like it gives me 10 no matter what.
        start_record = 0

        headers = {"Content-Type": "application/json"}
        url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
        bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
        params = {
            "request": {
                "sEcho": 2,
                "iColumns": 4,
                "sColumns": "",
                "iDisplayStart": 0,
                "iDisplayLength": per_page,
                "mDataProp_0": "ShortTitle",
                "mDataProp_1": "Title",
                "mDataProp_2": "LegislationCategories",
                "mDataProp_3": "Modified",
                "iSortCol_0": 0,
                "sSortDir_0": "asc",
                "iSortingCols": 0,
                "bSortable_0": "true",
                "bSortable_1": "true",
                "bSortable_2": "true",
                "bSortable_3": "true"
            },
            "criteria": {
                "Keyword": "",
                "Category": "",
                "SubCategoryId": "",
                "RequestOf": "",
                "CouncilPeriod": str(session),
                "Introducer": "",
                "CoSponsor": "",
                "CommitteeReferral": "",
                "CommitteeReferralComments": "",
                "StartDate": "",
                "EndDate": "",
                "QueryLimit": 100,
                "FilterType": "",
                "Phases": "",
                "LegislationStatus": "0",
                "IncludeDocumentSearch": "false"
            }
        }
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        #the response is a terrible string-of-nested-json-strings. Yuck.
        response = self.decode_json(response.json()["d"])
        data = response["aaData"]

        global bill_versions

        while len(data) > 0:

            for bill in data:

                bill_versions = [
                ]  #sometimes they're in there more than once, so we'll keep track

                bill_id = bill["Title"]
                if bill_id.startswith("AG"):
                    #actually an agenda, skip
                    continue
                bill_params = {"legislationId": bill_id}
                bill_info = self.post(bill_url,
                                      headers=headers,
                                      data=json.dumps(bill_params))
                bill_info = self.decode_json(bill_info.json()["d"])["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]

                if bill_id.startswith("R") or bill_id.startswith("CER"):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                #dc has no chambers. calling it all upper
                bill = Bill(session, "upper", bill_id, title, type=bill_type)

                #sponsors and cosponsors
                if "Introducer" in legislation_info:
                    introducers = legislation_info["Introducer"]
                    intro_date = self.date_format(
                        legislation_info["IntroductionDate"])
                    bill.add_action("upper",
                                    "Introduced",
                                    intro_date,
                                    type="bill:introduced")
                else:
                    #sometimes there are introducers, sometimes not.
                    # Set Introducers to empty array to avoid downstream breakage, but log bills without introducers
                    self.logger.warning("No Introducer: {0} {1}: {2}".format(
                        bill['chamber'], bill['session'], bill['bill_id']))
                    introducers = []

                try:
                    #sometimes there are cosponsors, sometimes not.
                    cosponsors = legislation_info["CoSponsor"]
                except KeyError:
                    cosponsors = []

                for i in introducers:
                    sponsor_name = i["Name"]
                    #they messed up Phil Mendelson's name
                    if sponsor_name == "Phil Pmendelson":
                        sponsor_name = "Phil Mendelson"
                    bill.add_sponsor(name=sponsor_name, type="primary")
                for s in cosponsors:
                    sponsor_name = s["Name"]
                    if sponsor_name == "Phil Pmendelson":
                        sponsor_name = "Phil Mendelson"
                    bill.add_sponsor(name=sponsor_name, type="cosponsor")

                #if it's become law, add the law number as an alternate title
                if "LawNumber" in legislation_info:
                    law_num = legislation_info["LawNumber"]
                    if law_num:
                        bill.add_title(law_num)

                #also sometimes it's got an act number
                if "ActNumber" in legislation_info:
                    act_num = legislation_info["ActNumber"]
                    if act_num:
                        bill.add_title(act_num)

                #sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        prev_title = add_info.lower().replace(
                            "previously", "").strip().replace(" ", "")
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill["additional_information"] = add_info

                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(
                        legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0][
                        "Name"].strip()
                    if withdrawn_by == "the Mayor":

                        bill.add_action("executive", "withdrawn",
                                        withdrawn_date, "bill:withdrawn")

                    elif "committee" in withdrawn_by.lower():
                        bill.add_action("upper",
                                        "withdrawn",
                                        withdrawn_date,
                                        "bill:withdrawn",
                                        committees=withdrawn_by)
                    else:
                        bill.add_action("upper",
                                        "withdrawn",
                                        withdrawn_date,
                                        "bill:withdrawn",
                                        legislators=withdrawn_by)

                #deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor != []:
                    mayor = mayor[0]

                    #in dc, mayor == governor because openstates schema
                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(
                            mayor["TransmittedDate"])

                        bill.add_action("executive",
                                        "transmitted to mayor",
                                        transmitted_date,
                                        type="governor:received")

                    if 'SignedDate' in mayor:
                        signed_date = self.date_format(mayor["SignedDate"])

                        bill.add_action("executive",
                                        "signed",
                                        signed_date,
                                        type="governor:signed")

                    elif 'ReturnedDate' in mayor:  #if returned but not signed, it was vetoed
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action("executive",
                                        "vetoed",
                                        veto_date,
                                        type="governor:vetoed")

                        if 'EnactedDate' in mayor:  #if it was returned and enacted but not signed, there was a veto override
                            override_date = self.date_format(
                                mayor["EnactedDate"])

                            bill.add_action("upper",
                                            "veto override",
                                            override_date,
                                            type="bill:veto_override:passed")

                    if 'AttachmentPath' in mayor:
                        #documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"], bill)

                congress = bill_info["CongressReview"]
                if len(congress) > 0:
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(
                            congress["TransmittedDate"])

                        bill.add_action("other",
                                        "Transmitted to Congress for review",
                                        transmitted_date)

                #deal with committee actions
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    self.logger.warning(
                        "Crap, we can't find anything that looks like an action date. Skipping"
                    )
                    continue
                date = self.date_format(date)
                if "CommitteeReferral" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferral"]:
                        if committee["Name"].lower(
                        ) == "retained by the council":
                            committees = []
                            break
                        else:
                            committees.append(committee["Name"])
                    if committees != []:
                        bill.add_action("upper",
                                        "referred to committee",
                                        date,
                                        committees=committees,
                                        type="committee:referred")

                if "CommitteeReferralComments" in legislation_info:
                    committees = []
                    for committee in legislation_info[
                            "CommitteeReferralComments"]:
                        committees.append(committee["Name"])
                    bill.add_action("upper",
                                    "comments from committee",
                                    date,
                                    committees=committees,
                                    type="other")

                #deal with random docs floating around
                docs = bill_info["OtherDocuments"]
                for d in docs:
                    if "AttachmentPath" in d:
                        self.add_documents(d["AttachmentPath"], bill)
                    else:
                        self.logger.warning(
                            "Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"], bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],
                                       bill)

                #full council votes
                votes = bill_info["VotingSummary"]
                for vote in votes:
                    self.process_vote(vote, bill, member_ids)

                #deal with committee votes
                if "CommitteeMarkup" in bill_info:
                    committee_info = bill_info["CommitteeMarkup"]
                    if len(committee_info) > 0:
                        for committee_action in committee_info:
                            self.process_committee_vote(committee_action, bill)
                        if "AttachmentPath" in committee_info:
                            self.add_documents(vote["AttachmentPath"], bill,
                                               is_version)

                bill.add_source(bill_source_url)
                self.save_bill(bill)

            #get next page
            start_record += per_page
            params["request"]["iDisplayStart"] = start_record
            param_json = json.dumps(params)
            response = self.post(url, headers=headers, data=param_json)
            response = self.decode_json(response.json()["d"])
            data = response["aaData"]
Beispiel #10
0
    def scrape(self, chamber, session):
        """Scrape and save every bill of one chamber from the KS bill API.

        Fetches the ``bill_status/`` listing below ``ksapi.url``, keeps the
        bills whose number starts with the chamber's letter, and for each one
        builds a ``Bill`` with its titles, sponsors and action history, hands
        it to ``self.scrape_html`` and saves it with ``self.save_bill``.

        :param chamber: ``'upper'`` selects Senate bills; any other value
            selects House bills.
        :param session: session identifier passed through to ``Bill``.
        :raises ScrapeError: if ``which abiword`` reports failure.
        """
        # check for abiword
        if os.system('which abiword') != 0:
            raise ScrapeError('abiword is required for KS scraping')

        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # Without this branch btype would be unbound for the
                    # first bill, or silently keep the previous bill's type.
                    self.warning('unknown bill type for %s, assuming bill' %
                                 bill_id)
                    btype = 'bill'

                title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

                # main
                bill = Bill(session,
                            chamber,
                            bill_id,
                            title,
                            type=btype,
                            status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                # only record the long title when it adds information
                if (bill_data['LONGTITLE']
                        and bill_data['LONGTITLE'] != bill['title']):
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; with several, all are cosponsors
                sponsors = bill_data['SPONSOR_NAMES']
                stype = 'primary' if len(sponsors) == 1 else 'cosponsor'
                for sponsor in sponsors:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper'
                             if event['chamber'] == 'Senate' else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning(
                            'unknown action code on %s: %s %s' %
                            (bill_id, event['action_code'], event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                # a failed HTML fetch is logged; the bill is still saved
                try:
                    self.scrape_html(bill)
                except scrapelib.HTTPError:
                    self.warning('unable to fetch HTML for bill {0}'.format(
                        bill['bill_id']))
                self.save_bill(bill)
Beispiel #11
0
    def scrape(self, chamber, session):
        """Scrape and save every bill of one chamber from the KS bill API.

        Fetches the ``bill_status/`` listing below ``ksapi.url``, keeps the
        bills whose number starts with the chamber's letter, and for each one
        builds a ``Bill`` with its titles, sponsors and action history, hands
        it to ``self.scrape_html`` and saves it with ``self.save_bill``.

        :param chamber: ``'upper'`` selects Senate bills; any other value
            selects House bills.
        :param session: session identifier passed through to ``Bill``.
        :raises ScrapeError: if ``which abiword`` reports failure.
        """
        # check for abiword
        if os.system('which abiword') != 0:
            raise ScrapeError('abiword is required for KS scraping')

        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # Without this branch btype would be unbound for the
                    # first bill, or silently keep the previous bill's type.
                    self.warning('unknown bill type for %s, assuming bill' %
                                 bill_id)
                    btype = 'bill'

                title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

                # main
                bill = Bill(session, chamber, bill_id, title,
                            type=btype, status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                # only record the long title when it adds information
                if (bill_data['LONGTITLE'] and
                        bill_data['LONGTITLE'] != bill['title']):
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; with several, all are cosponsors
                sponsors = bill_data['SPONSOR_NAMES']
                stype = 'primary' if len(sponsors) == 1 else 'cosponsor'
                for sponsor in sponsors:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper' if event['chamber'] == 'Senate'
                             else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning('unknown action code on %s: %s %s' %
                                     (bill_id, event['action_code'],
                                      event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                # a failed HTML fetch is logged; the bill is still saved
                try:
                    self.scrape_html(bill)
                except scrapelib.HTTPError:
                    self.warning('unable to fetch HTML for bill {0}'.format(
                        bill['bill_id']))
                self.save_bill(bill)
Beispiel #12
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        The bill page is located via ``BILL_URL`` using the session and the
        bill id with its spaces removed. The resulting ``Bill`` is saved with
        ``self.save_bill``; nothing is returned.

        :param chamber: chamber the bill belongs to; also the initial actor
            for its actions.
        :param session: session identifier; its first four characters are
            used as the year of every action date.
        :param bill_id: bill identifier, possibly containing spaces.
        :param short_title: optional title, added as an alternate title when
            it differs (case-insensitively) from the scraped one.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            html.make_links_absolute(
                'http://legislature.idaho.gov/legislation/%s/' % session)
            # tables used below: [0] sponsors, [1] title, [2] action rows
            bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath(
                './/table')
            title = bill_tables[1].text_content().strip()
            bill_type = get_bill_type(bill_id)
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)
            bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

            if short_title and bill['title'].lower() != short_title.lower():
                bill.add_title(short_title)

            # documents
            doc_links = html.xpath('//span/a')
            for link in doc_links:
                name = link.text_content().strip()
                href = link.get('href')
                if 'Engrossment' in name or 'Bill Text' in name:
                    bill.add_version(name, href)
                else:
                    bill.add_document(name, href)

            # sponsors range from a committee to one legislator to a group of legs
            sponsor_lists = bill_tables[0].text_content().split('by')
            # everything before the first 'by' is not a sponsor
            for sponsors in sponsor_lists[1:]:
                for person in sponsors.split(','):
                    # the split pieces carry surrounding whitespace and may
                    # be empty; neither belongs in a sponsor name
                    person = person.strip()
                    if person:
                        bill.add_sponsor('primary', person)

            actor = chamber
            last_date = None
            for row in bill_tables[2]:
                # lots of empty rows
                if len(row) == 1:
                    continue
                _, date, action, _ = [x.text_content().strip() for x in row]

                # rows without a date inherit the date of the previous row
                # NOTE(review): if the very first row has no date, date is
                # None here and the concatenation below raises TypeError.
                if date:
                    last_date = date
                else:
                    date = last_date

                date = datetime.datetime.strptime(date + '/' + session[0:4],
                                                  "%m/%d/%Y")
                if action.startswith('House'):
                    actor = 'lower'
                elif action.startswith('Senate'):
                    actor = 'upper'

                # votes
                if 'AYES' in action or 'NAYS' in action:
                    vote = self.parse_vote(actor, date, row[2])
                    vote.add_source(url)
                    bill.add_vote(vote)
                # some td's text is separated by br elements
                if len(row[2]):
                    action = "".join(row[2].itertext())
                action = action.replace(u'\xa0', ' ').strip()
                atype = get_action(actor, action)
                bill.add_action(actor, action, date, type=atype)
                # after voice vote/roll call and some actions the bill is sent
                # 'to House' or 'to Senate'
                if 'to House' in action:
                    actor = 'lower'
                elif 'to Senate' in action:
                    actor = 'upper'
            self.save_bill(bill)
Beispiel #13
0
    def scrape_current(self, chamber, term):
        """Scrape and save every bill of one chamber from the KS bill API.

        Fetches the ``bill_status/`` listing below ``ksapi.url``, keeps the
        bills whose number starts with the chamber's letter, and for each one
        builds a ``Bill`` with its titles, sponsors and action history, hands
        it to ``self.scrape_html`` and saves it with ``self.save_bill``.

        :param chamber: ``'upper'`` selects Senate bills; any other value
            selects House bills.
        :param term: term identifier passed through to ``Bill``.
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # Without this branch btype would be unbound for the
                    # first bill, or silently keep the previous bill's type.
                    self.warning('unknown bill type for %s, assuming bill' %
                                 bill_id)
                    btype = 'bill'

                # fall back to the long title so the primary title is not
                # empty when the API supplies no short title
                long_title = bill_data['LONGTITLE']
                title = bill_data['SHORTTITLE'] or long_title

                # main
                bill = Bill(term,
                            chamber,
                            bill_id,
                            title,
                            type=btype,
                            status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                # only record the long title when it adds information
                if long_title and long_title != title:
                    bill.add_title(long_title)

                # a lone sponsor is primary; with several, all are cosponsors
                sponsors = bill_data['SPONSOR_NAMES']
                stype = 'primary' if len(sponsors) == 1 else 'cosponsor'
                for sponsor in sponsors:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper'
                             if event['chamber'] == 'Senate' else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning(
                            'unknown action code on %s: %s %s' %
                            (bill_id, event['action_code'], event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                self.scrape_html(bill)
                self.save_bill(bill)
Beispiel #14
0
    def scrape(self, chamber, session):
        """Scrape and save every bill of one chamber from the KS bill API.

        Fetches the ``bill_status/`` listing below ``ksapi.url``, keeps the
        bills whose number starts with the chamber's letter, and for each one
        builds a ``Bill`` with its titles, sponsors and action history, hands
        it to ``self.scrape_html`` and saves it with ``self.save_bill``.

        :param chamber: ``"upper"`` selects Senate bills; any other value
            selects House bills.
        :param session: session identifier passed through to ``Bill``.
        :raises ScrapeError: if ``which abiword`` reports failure.
        """
        # check for abiword
        if os.system("which abiword") != 0:
            raise ScrapeError("abiword is required for KS scraping")

        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + "bill_status/") as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json["content"]
            for bill_data in bills:

                bill_id = bill_data["BILLNO"]

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                if "CR" in bill_id:
                    btype = "concurrent resolution"
                elif "R" in bill_id:
                    btype = "resolution"
                elif "B" in bill_id:
                    btype = "bill"
                else:
                    # Without this branch btype would be unbound for the
                    # first bill, or silently keep the previous bill's type.
                    self.warning("unknown bill type for %s, assuming bill" % bill_id)
                    btype = "bill"

                title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

                # main
                bill = Bill(session, chamber, bill_id, title, type=btype, status=bill_data["STATUS"])
                bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

                # only record the long title when it adds information
                if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill["title"]:
                    bill.add_title(bill_data["LONGTITLE"])

                # a lone sponsor is primary; with several, all are cosponsors
                sponsors = bill_data["SPONSOR_NAMES"]
                stype = "primary" if len(sponsors) == 1 else "cosponsor"
                for sponsor in sponsors:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data["HISTORY"]):

                    actor = "upper" if event["chamber"] == "Senate" else "lower"

                    date = datetime.datetime.strptime(event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if "committee_names" in event:
                        action = event["status"] + " " + " and ".join(event["committee_names"])
                    else:
                        action = event["status"]

                    if event["action_code"] not in ksapi.action_codes:
                        self.warning(
                            "unknown action code on %s: %s %s" % (bill_id, event["action_code"], event["status"])
                        )
                        atype = "other"
                    else:
                        atype = ksapi.action_codes[event["action_code"]]
                    bill.add_action(actor, action, date, type=atype)

                # a failed HTML fetch is logged; the bill is still saved
                try:
                    self.scrape_html(bill)
                except scrapelib.HTTPError:
                    self.warning("unable to fetch HTML for bill {0}".format(bill["bill_id"]))
                self.save_bill(bill)
Beispiel #15
0
    def _scrape_data(self, chamber, session, jsons):
        """Build and save bills of one chamber from a KS bill-status payload.

        Each entry under the payload's ``'content'`` key that passes the
        optional bill-id filter and belongs to the requested chamber becomes
        a ``Bill`` with titles, sponsors and action history, is handed to
        ``self.scrape_html`` and is saved with ``self.save_bill``.

        :param chamber: ``'upper'`` selects Senate bills; any other value
            selects House bills.
        :param session: session identifier passed through to ``Bill``.
        :param jsons: JSON text whose ``'content'`` key holds the bill list.
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        bill_request_json = json.loads(jsons)
        bills = bill_request_json['content']
        for bill_data in bills:
            bill_id = bill_data['BILLNO']

            # get_filter_bill_id() returning False means "no filter"; any
            # other value restricts the run to that single bill id
            filter_bill_id = self.get_filter_bill_id()
            if filter_bill_id is not False:
                if bill_id != filter_bill_id:
                    _log.debug("Skipping bill_id %s != %s" % (
                        bill_id, filter_bill_id))
                    continue
                _log.debug(
                    "Matched bill_id %s == %s" % (
                        bill_id,
                        filter_bill_id))
            else:
                _log.debug(
                    "no check bill_id %s and %s" % (
                        bill_id,
                        filter_bill_id)
                )

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # Without this branch btype would be unbound for the first
                # bill, or silently keep the previous bill's type.
                self.warning('unknown bill type for %s, assuming bill' %
                             bill_id)
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(session, chamber, bill_id, title,
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            # only record the long title when it adds information
            if (
                    bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill['title']):
                bill.add_title(bill_data['LONGTITLE'])

            # a lone sponsor is primary; with several, all are cosponsors
            sponsors = bill_data['SPONSOR_NAMES']
            stype = 'primary' if len(sponsors) == 1 else 'cosponsor'
            for sponsor in sponsors:
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):

                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(
                    event['occurred_datetime'],
                    "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            # a failed HTML fetch is logged; the bill is still saved
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError as e:
                self.debug(e)
                self.debug(bill)
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))
            self.save_bill(bill)