Exemple #1
0
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Scrape every Assembly bill/resolution listed for one session.

        For each document type the history-list page is fetched, and every
        linked bill page is parsed for its id, title, text version, sponsors,
        minutes documents, actions and votes before the bill is saved.

        chamber -- chamber value handed to ``Bill``.
        insert  -- session path segment used to build the URLs; when it
                   contains 'Special' it also replaces ``session``.
        session -- session identifier handed to ``Bill``.
        year    -- forwarded unchanged to ``scrape_votes``.
        """
        base_url = 'http://www.leg.state.nv.us'

        # Special sessions are identified by their URL segment rather than by
        # the regular session name; this does not depend on the bill, so it
        # is decided once up front.
        if 'Special' in insert:
            session = insert

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution'}
        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            for link in self.scrape_links(parentpage_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                with self.urlopen(page_path) as page:
                    # Non-breaking spaces would otherwise leak into the
                    # extracted strings.
                    page = page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                    title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type)
                    bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                    bill.add_version("Bill Text", base_url + bill_text)

                    primary, secondary = self.scrape_sponsors(page)

                    if primary and primary[0] == 'By:':
                        # A leading 'By:' token means the remaining tokens
                        # form one sponsor name split into words.
                        name_parts = primary[1:]
                        if name_parts and name_parts[0] == 'ElectionsProceduresEthicsand':
                            name_parts[0] = 'Elections Procedures Ethics and'
                        # join() avoids the trailing space the manual
                        # concatenation used to leave on the name.
                        full_name = ' '.join(name_parts)
                        if full_name:
                            bill.add_sponsor('primary', full_name)
                    else:
                        for leg in primary:
                            bill.add_sponsor('primary', leg)
                    for leg in secondary:
                        bill.add_sponsor('cosponsor', leg)

                    # The date for the n-th minutes link is read from table
                    # row n + 1 (row numbering starts at 2).
                    minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                    for row_num, mr in enumerate(minutes_links, 2):
                        minutes_url = base_url + mr.xpath("string(@href)")
                        minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % row_num
                        date_parts = mr.xpath(minutes_date_path).split()
                        minutes_name = date_parts[0] + date_parts[1] + date_parts[2] + " Minutes"
                        bill.add_document(minutes_name, minutes_url)

                    self.scrape_actions(root, bill, "lower")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemple #2
0
    def scrape_bill(self, session, chamber, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        The title is the text that follows the ninth ``<br>`` on the page;
        when that text is empty nothing is saved. Actions are scraped from
        the 'getActions' link, followed by any known version links and
        fiscal impact statements; subjects come from ``self.subjects``.
        """
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)

            raw_title = doc.xpath("//br")[8].tail
            if not raw_title:
                return

            bill = Bill(session, chamber, bill_id, raw_title.strip())
            bill.add_source(url)

            actions_anchor = doc.xpath("//a[contains(@href, 'getActions')]")[0]
            self.scrape_actions(bill, actions_anchor.attrib['href'])

            # Only the first link matching each label is recorded.
            version_labels = ('Introduced Bill', 'House Bill',
                              'Senate Bill', 'Engrossed Bill',
                              'Enrolled Act')
            for label in version_labels:
                matches = doc.xpath("//a[contains(., '%s')]" % label)
                if matches:
                    bill.add_version(label, matches[0].attrib['href'])

            for fiscal_anchor in doc.xpath("//a[contains(@href, 'FISCAL')]"):
                # The statement number is whatever precedes the first '('.
                number = fiscal_anchor.text.strip().split("(")[0]
                doc_name = "Fiscal Impact Statement #%s" % number
                bill.add_document(doc_name, fiscal_anchor.attrib['href'])

            bill['subjects'] = self.subjects[bill_id]

            self.save_bill(bill)
Exemple #3
0
    def scrape(self, chamber, session):
        """Build and save one hard-coded sample bill for ``chamber``.

        The bill gets a fixed source, version, document, two sponsors, two
        votes (one per chamber) and three actions.
        """
        self.validate_session(session)

        is_upper = chamber == 'upper'
        other_chamber = 'lower' if is_upper else 'upper'
        bill_id = 'SB 1' if is_upper else 'HB 1'

        first_day = datetime.datetime(2010, 1, 29)
        second_day = datetime.datetime(2010, 1, 30)

        sample = Bill(session, chamber, bill_id, 'A super bill')
        sample.add_source('http://example.com/')
        sample.add_version('As Introduced', 'http://example.com/SB1.html')
        sample.add_document('Google', 'http://google.com')
        sample.add_sponsor('primary', 'Bob Smith')
        sample.add_sponsor('secondary', 'Johnson, Sally')

        # Vote that passed in the upper chamber, 2-0-0.
        upper_vote = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
        for voter in ('Smith', 'Johnson'):
            upper_vote.yes(voter)

        # Vote that failed in the lower chamber, 0-1-1.
        lower_vote = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
        lower_vote.no('Bob Smith')
        lower_vote.other('S. Johnson')

        sample.add_vote(upper_vote)
        sample.add_vote(lower_vote)

        sample.add_action(chamber, 'introduced', first_day)
        sample.add_action(chamber, 'read first time', second_day)
        sample.add_action(other_chamber, 'introduced', second_day)

        self.save_bill(sample)
Exemple #4
0
    def scrape_year(self, year, chamber):
        """Scrape every bill of ``chamber`` linked from the daily status page.

        Bill links are selected by the first digit of the four-digit bill
        number: 5-9 for 'upper', 1-4 otherwise. For each bill page the id,
        session and title are read, then actions, documents/versions,
        sponsors and roll-call votes are attached before the bill is saved.

        year    -- year appended to the daily status URL.
        chamber -- 'upper' or anything else (treated as the lower chamber).
        """
        sep = '<h1>House</h1>'

        # NOTE(review): ``sep``/``after`` are handed to ``lxml_context``;
        # presumably they limit parsing to the part of the page before or
        # after the House heading -- confirm against that helper.
        if chamber == 'upper':
            after = False
            reg = '[5-9]'
        else:
            after = True
            reg = '[1-4]'

        bill_link_re = re.compile("bill=" + reg + "[0-9]{3}")
        status_url = "http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year)

        with self.lxml_context(status_url, sep, after) as page:
            for element, attribute, link, pos in page.iterlinks():
                if bill_link_re.search(link) is None:
                    continue

                bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                with self.lxml_context(bill_page_url) as bill_page:
                    # The <title> text is split on single spaces: words 0-1
                    # form the bill id and word 3 is the session.
                    raw_title = bill_page.cssselect('title')
                    split_title = raw_title[0].text_content().split(' ')
                    bill_id = (split_title[0] + ' ' + split_title[1]).strip()
                    session = split_title[3].strip()

                    title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                    title = title_element.text_content()

                    bill = Bill(session, chamber, bill_id, title)
                    bill.add_source(bill_page_url)

                    self.scrape_actions(bill_page, bill)

                    # Distinct names so the outer loop variables are not
                    # shadowed while walking the bill page's own links.
                    for doc_element, _attr, doc_link, _pos in bill_page.iterlinks():
                        if re.search("billdocs", doc_link) is not None:
                            if re.search("Amendments", doc_link) is not None:
                                bill.add_document("Amendment: " + doc_element.text_content(), doc_link)
                            elif re.search("Bills", doc_link) is not None:
                                bill.add_version(doc_element.text_content(), doc_link)
                            else:
                                bill.add_document(doc_element.text_content(), doc_link)
                        elif re.search("senators|representatives", doc_link) is not None:
                            with self.lxml_context(doc_link) as senator_page:
                                try:
                                    name_tuple = self.scrape_legislator_name(senator_page)
                                    bill.add_sponsor('primary', name_tuple[0])
                                except Exception:
                                    # Best effort: a sponsor page that cannot
                                    # be parsed must not abort the bill, but
                                    # KeyboardInterrupt/SystemExit still
                                    # propagate.
                                    pass
                        elif re.search("ShowRollCall", doc_link) is not None:
                            ids = re.search("([0-9]+),([0-9]+)", doc_link)
                            if ids is None:
                                # Roll-call link without the expected
                                # "<id>,<bienId>" pair: nothing to fetch.
                                continue
                            id1, id2 = ids.group(1), ids.group(2)
                            url = "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" + id2
                            with self.lxml_context(url) as vote_page:
                                self.scrape_votes(vote_page, bill, url)

                    self.save_bill(bill)
Exemple #5
0
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        """Parse one bill info page and save the resulting ``Bill``.

        A bill without a '<bill_id>/bill.doc' link is treated as withdrawn
        and skipped. Otherwise the title, most recent version, sponsors,
        actions and (if present) the vote history PDF are recorded.

        chamber       -- chamber of the bill; also the default action actor.
        session       -- session name; characters 2-3 give the two-digit
                         year used to date the actions.
        bill_id       -- bill identifier, also used in the version URL.
        bill_info_url -- URL of the bill info page.
        """
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            sponsor_links = bill_info.findAll(href=re.compile(
                    r'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            # Action lines only carry month and day; the year comes from the
            # session name.
            session_year = int('20' + session[2:4])

            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                # Each line looks like "<Mon> <day>-<description>".
                date_text, _, action = action.partition('-')
                # Parse with the year included: parsing "%b %d" alone uses
                # the default year 1900, which rejects "Feb 29".
                action_date = dt.datetime.strptime(
                    '%s %d' % (date_text, session_year), '%b %d %Y')

                if action.endswith(('House', '(H)')):
                    actor = 'lower'
                elif action.endswith(('Senate', '(S)')):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Exemple #6
0
    def scrape(self, chamber, session):
        """Scrape consecutively numbered bills until no page is returned.

        Numbering starts at SB 1 for the upper chamber and HB 4001
        otherwise. For each bill the title, sponsors, versions, optional
        analysis documents and actions are collected and the bill is saved.
        The loop ends when ``scrape_bill`` returns ``None``.
        """
        self.validate_session(session)

        if chamber == 'upper':
            bill_no = 1
            abbr = 'SB'
        else:
            bill_no = 4001
            abbr = 'HB'

        while True:
            raw_page = self.scrape_bill(session, abbr, bill_no)
            # if we can't find a page, we must be done. This is a healthy
            # thing. The check has to happen before parsing: once wrapped in
            # BeautifulSoup the value is never None.
            if raw_page is None:
                return
            bill_page = BeautifulSoup(raw_page)

            title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
            title = title.replace('\n', '').replace('\r', '')
            bill_id = "%s %d" % (abbr, bill_no)

            the_bill = Bill(session, chamber, bill_id, title)

            # sponsors: the first link is the primary sponsor, the rest are
            # cosponsors
            sponsor_links = bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a')
            for position, name in enumerate(sponsor_links):
                sponsor_type = 'primary' if position == 0 else 'cosponsor'
                the_bill.add_sponsor(sponsor_type, name.string)

            # versions
            for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_version(*r)

            # documents: both sections are optional, so look the elements up
            # directly instead of searching the serialized page for the id
            for section_id in ('frg_billstatus_HlaTable',
                               'frg_billstatus_SfaSection'):
                sections = bill_page.findAll(id=section_id)
                if not sections:
                    continue
                for doc in sections[0].findAll('tr'):
                    r = self.parse_doc(the_bill, doc)
                    if r:
                        the_bill.add_document(*r)

            self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
            self.save_bill(the_bill)
            bill_no = bill_no + 1
Exemple #7
0
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        """Scrape one bill info page and save the resulting ``Bill``.

        Collects the title, primary sponsor, full summary link and action
        history, then hands the bill to ``scrape_votes`` when a votes link
        is present.

        chamber     -- chamber of the bill; also used as actor for actions.
        session     -- session identifier handed to ``Bill``.
        bill_number -- bill identifier, substituted into the info URL.
        ga_num      -- second value substituted into ``self.urls['info']``.
        """
        # Local import so this method does not rely on a module-level one.
        import re

        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            title = page.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)

            # Primary Sponsor: the text after the last standalone word "by".
            # Splitting on the bare substring would also cut names that
            # contain "by" (e.g. "Kirby").
            sponsor_text = page.xpath("//span[@id='lblBillSponsor']")[0].text_content()
            sponsor = re.split(r'\bby\b', sponsor_text)[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary.get('href'))

            # Actions: every row after the header row holds the action text
            # in the first cell and its date in the second.
            tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
            actions_table = tables[0]
            for ar in actions_table.xpath("tr[position()>1]"):
                cells = ar.xpath("td")
                action_taken = cells[0].text
                action_date = datetime.datetime.strptime(cells[1].text.strip(), '%m/%d/%Y')
                bill.add_action(chamber, action_taken, action_date)

            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if votes_link:
                votes_href = votes_link[0].get('href')
                # scrape_votes returns the bill that is saved below.
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_href,))

            self.save_bill(bill)
Exemple #8
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on bills from a number of DBF files.

        The files are processed in this order: MAINBILL (creates one Bill per
        record), BILLSPON (sponsors), BILLWP (versions and documents), the
        per-chamber vote zip archives, BILLHIST (actions) and BILLSUBJ
        (subjects). Records are tied together by a bill id built from the
        bill type and the integer bill number. All bills are saved once at
        the very end.

        session  -- session identifier; converted with str() for ``Bill``.
        year_abr -- handed to ``get_dbf`` and used to name the vote archives;
                    ``year_abr + 1`` is computed below, so it must support
                    integer addition (presumably an int -- confirm).
        """

        #Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            # A bill type starting with 'A' goes to the lower chamber,
            # everything else to the upper chamber.
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # The bill type minus its first character selects the Bill type
            # from self._bill_types.
            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            # NOTE(review): raises KeyError for a sponsor record whose bill
            # is not in MAINBILL; the subjects section below tolerates that
            # case -- confirm which behavior is intended.
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            # 'P' marks a primary sponsor; any other value is a co-sponsor.
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)


        #Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        #print bill_document_db[2]
        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            # The document field is a backslash-separated path; only its
            # last two components ("<dir>/<file>") are kept.
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            # NOTE(review): ``year`` is only referenced by the commented-out
            # FTP URL below and is otherwise unused in this method.
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            # Link to the .HTM rendition instead of the .DOC file.
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            doc_name = self._doctypes[rec['doctype']]
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            # Doctypes listed in self._version_types are bill versions;
            # everything else is an ordinary document.
            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        #Votes: one zip archive per chamber letter ('A' or 'S') and year
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        # For 2010 only the first-year archives are read (presumably the
        # second-year files did not exist yet -- TODO confirm).
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipedfile = zipfile.ZipFile(s_vote_zip)
            # Each archive is read through a member named after the archive.
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            # Votes collected from this archive, keyed by "<bill>_<action>".
            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # Every row is one legislator's vote; rows sharing the same bill
            # and action are folded into a single Vote object.
            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                # NOTE(review): ``datetime.strptime`` implies ``datetime``
                # here is the class, not the module -- confirm the import.
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                passed = None

                # Outcome and counts are unknown at creation time; they are
                # filled in after all rows have been read.
                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None,
                                          None, None, bill_id=bill_id)
                # "Y" is a yes, "N" a no, any other value counts as other.
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                # A vote passes when yes votes strictly outnumber no votes.
                if vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                # NOTE(review): raises KeyError when the vote file names a
                # bill that is not in MAINBILL.
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')


        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            # categorize_action returns the action text and its type.
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            # Unknown bill ids are reported with a warning rather than
            # raising.
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Exemple #9
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble bills from the DBF files and vote archives, then save.

        MAINBILL creates one Bill per record; BILLSPON adds sponsors, BILLWP
        adds documents, the per-chamber vote zip archives add votes and
        BILLHIST adds actions. Records are tied together by a bill id built
        from the bill type and the integer bill number. Every bill is saved
        exactly once, after all files have been processed.

        session  -- session identifier; converted with str() for ``Bill``.
        year_abr -- substituted into the file URLs; ``year_abr + 1`` is
                    computed below, so it must support integer addition.
        """

        #Main Bill information
        main_bill_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/MAINBILL.DBF' % (year_abr)
        MAINBILL_dbf, resp = self.urlretrieve(main_bill_url)
        main_bill_db = dbf.Dbf(MAINBILL_dbf)
        # maps bill_id to its Bill object
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            # A bill type starting with 'A' goes to the lower chamber,
            # everything else to the upper chamber.
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"
            bill = Bill(str(session), chamber, bill_id, title)
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLSPON.DBF' % (year_abr)
        SPONSORS_dbf, resp = self.urlretrieve(bill_sponsors_url)
        bill_sponsors_db = dbf.Dbf(SPONSORS_dbf)

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            # 'P' marks a primary sponsor; any other value is a co-sponsor.
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)

        #Documents
        bill_document_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLWP.DBF' % (year_abr)
        DOC_dbf, resp = self.urlretrieve(bill_document_url)
        bill_document_db = dbf.Dbf(DOC_dbf)

        # The FTP directory name joins the two years, e.g. "20102011".
        year = str(year_abr) + str((year_abr + 1))

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            # The document field is a backslash-separated path; the file
            # name becomes the document name and the last two components
            # form the relative URL.
            document = rec["document"]
            document = document.split('\\')
            doc_name = document[-1]
            document = document[-2] + "/" + document[-1]
            doc_url = "ftp://www.njleg.state.nj.us/%s" % year
            doc_url = doc_url + "/" + document
            bill.add_document(doc_name, doc_url)

        #Votes: one zip archive per chamber letter ('A' or 'S') and year
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        # For 2010 only the first-year archives are read.
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            # Votes collected from this archive, keyed by "<bill>_<action>".
            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # Every row is one legislator's vote; rows sharing the same bill
            # and action are folded into a single Vote object.
            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                passed = None

                # Outcome and counts are filled in once all rows are read.
                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None, None, None, bill_id=bill_id)
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                # A vote passes when yes votes strictly outnumber no votes.
                vote["passed"] = vote_yes_count > vote_no_count
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLHIST.DBF' % (year_abr)
        ACTION_dbf, resp = self.urlretrieve(bill_action_url)
        bill_action_db = dbf.Dbf(ACTION_dbf)

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            bill.add_action(actor, action, date, comment=comment)

        # Save every bill exactly once, after all files have been merged in.
        # Saving inside the action loop stored a bill once per action record
        # and skipped bills without any action; the sources were also only
        # added to whichever bill happened to be processed last.
        for bill in bill_dict.itervalues():
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            self.save_bill(bill)
Exemple #10
0
 def scrape_bill(self, chamber, session, bill_id):
     """
     Build a Bill from the "documents for bill" page: versions, fact
     sheets/summaries, agendas, house and senate calendars, amendments
     and videos. Actions are then scraped via ``scrape_actions``.
     """
     session_id = self.get_session_id(session)
     query = 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
                                        bill_id.replace(' ', ''), session_id)
     url = BASE_URL + query
     with self.urlopen(url) as docs_for_bill:
         root = html.fromstring(docs_for_bill)
         # the second "ContentPageTitle" div carries the bill title
         title_divs = root.xpath('//div[@class="ContentPageTitle"]')
         bill_title = title_divs[1].text.strip()
         bill = Bill(session, chamber, bill_id, bill_title,
                     type=utils.get_bill_type(bill_id))
         bill.add_source(url)

         # row selectors, filled in with a text or href fragment
         by_text = '//tr[contains(td/font/text(), "%s")]'
         by_href = '//tr[contains(td/a/@href, "%s")]'
         by_font_href = '//tr[contains(td/font/a/@href, "%s")]'

         # versions
         for row in root.xpath(by_text % 'd Version'):
             cells = row.cssselect('td')
             version_name = cells[1].text_content().strip()
             version_href = cells[2].xpath('string(font/a/@href)')
             bill.add_version(version_name, version_href)

         # fact sheets and summary
         for row in root.xpath(by_font_href % '/summary/'):
             cells = row.cssselect('td')
             sheet_name = cells[1].text_content().strip()
             sheet_href = cells[1].xpath('string(font/a/@href)')
             bill.add_document(sheet_name, sheet_href, type="summary")

         # agendas
         # revised, cancelled, date, time and room are skipped; the agenda
         # type (house or senate) is not recorded
         for row in root.xpath(by_href % '/agendas'):
             cells = row.cssselect('td')
             committee = cells[0].text_content().strip()
             # fall back to the seventh cell when the eighth has no link
             agenda_href = (cells[7].xpath('string(a/@href)').strip() or
                            cells[6].xpath('string(a/@href)').strip())
             bill.add_document(committee, agenda_href)

         # house calendars first, then senate calendars
         # calendar number, modified and date are skipped
         calendar_kinds = (('/calendar/h', 'house calendar'),
                           ('/calendar/s', 'senate calendar'))
         for fragment, doc_type in calendar_kinds:
             for row in root.xpath(by_href % fragment):
                 cells = row.cssselect('td')
                 calendar_name = cells[0].text_content().strip()
                 calendar_href = cells[5].xpath('string(a/@href)')
                 bill.add_document(calendar_name, calendar_href,
                                   type=doc_type)

         # amendments
         for row in root.xpath(by_text % 'AMENDMENT:'):
             cells = row.cssselect('td')
             amendment_name = cells[1].text_content().strip()
             amendment_href = cells[2].xpath('string(font/a/@href)')
             bill.add_document(amendment_name, amendment_href,
                               type='amendment')

         # videos
         # http://azleg.granicus.com/MediaPlayer.php?view_id=13&clip_id=7684
         for row in root.xpath(by_href % '&clip_id'):
             cells = row.cssselect('td')
             clip_title = cells[1].text_content().strip()
             clip_href = cells[2].xpath('string(a/@href)')
             clip_date = cells[0].text_content().strip()
             bill.add_document(clip_title, clip_href, date=clip_date,
                               type='video')

     self.scrape_actions(chamber, session, bill)
Exemple #11
0
    def scrape(self, chamber, year):
        """Scrape one chamber's bills for ``year`` from the bill locator site.

        The locator page lists sessions; every session whose name contains
        ``year`` is followed. Within a session, each listed bill whose number
        starts with the chamber letter ('S' for ``upper``, 'H' otherwise) is
        turned into a ``Bill`` carrying its sponsors, versions and documents,
        and is then handed to ``self.save_bill``.

        Raises ``NoDataForYear`` if ``year`` is not in ``metadata['sessions']``.
        """
        if year not in metadata['sessions']:
            raise NoDataForYear(year)

        if chamber == 'upper':
            start_char = 'S'
        else:
            start_char = 'H'

        def joined_spans(anchor, glue):
            # Stripped text of every <span> under ``anchor`` joined by
            # ``glue``; a span without a string contributes an empty piece.
            pieces = []
            for span in anchor('span'):
                pieces.append(span.string.strip() if span.string else '')
            return glue.join(pieces).strip()

        nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
        with self.urlopen(nm_locator_url) as locator_html:
            locator = BeautifulSoup(locator_html)
            # Row 0 of the locator table only holds the 'Bill Locator' heading.
            session_rows = locator.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
            for session_row in session_rows:
                session_tag = session_row.find('a')
                # Unlike joined_spans, no guard here: every session span is
                # expected to carry a string.
                name_parts = [span.string.strip() for span in session_tag('span')]
                session_name = ' '.join(name_parts).strip()

                if year not in session_name:
                    continue

                session_url = get_abs_url(nm_locator_url, session_tag['href'])
                with self.urlopen(session_url) as session_html:
                    session_soup = BeautifulSoup(session_html)
                    # Row 0 of the grid is skipped as well.
                    bill_rows = session_soup.find('table', id='ctl00_mainCopy_LocatorGrid')('tr')[1:]
                    for bill_row in bill_rows:
                        cells = bill_row('td')

                        bill_num_link = cells[0].find('a')
                        bill_num = joined_spans(bill_num_link, '')
                        # A leading '*' marks an emergency bill; it is dropped
                        # from the bill number for now.
                        if bill_num.startswith('*'):
                            bill_num = bill_num[1:]
                        if not bill_num.startswith(start_char):
                            self.log('Skipping %s. This bill is not for the relevant chamber %s.' % (bill_num, chamber))
                            continue

                        bill_title = cells[1].string.strip()
                        bill_url = get_abs_url(session_url, bill_num_link['href'].replace(' ', ''))

                        bill = Bill(session=session_name, chamber=chamber,
                                    bill_id=bill_num, title=bill_title)
                        bill.add_source(bill_url)

                        with self.urlopen(bill_url) as bill_html:
                            bill_soup = BeautifulSoup(bill_html)

                            sponsor_table = bill_soup.find('table', id='ctl00_mainCopy__SessionFormView')
                            # The final link in this table is 'Key to
                            # Abbreviations', so it is left out.
                            for sponsor_link in sponsor_table('a')[:-1]:
                                # One of the remaining anchors has only empty
                                # spans; it yields '' and is ignored.
                                sponsor_name = joined_spans(sponsor_link, ' ')
                                if sponsor_name:
                                    bill.add_sponsor(type='primary', name=sponsor_name)

                            introduced = bill_soup.find('table', id='ctl00_mainCopy_Introduced')
                            bill.add_version(**self.get_doc_data(bill_url, introduced))

                            committee_table = bill_soup.find('table', id='ctl00_mainCopy_CommReportsList')
                            if committee_table:
                                for report_row in committee_table('tr'):
                                    bill.add_document(**self.get_doc_data(bill_url, report_row))

                            fir_table = bill_soup.find('table', id='ctl00_mainCopy_FIRs')
                            if fir_table:
                                bill.add_document(**self.get_doc_data(bill_url, fir_table))

                            final_table = bill_soup.find('table', id='ctl00_mainCopy_FinalVersion')
                            if final_table:
                                bill.add_version(**self.get_doc_data(bill_url, final_table))

                        self.save_bill(bill)
Exemple #12
0
    def scrape_bills(self, chamber_to_scrape, session):
        """Scrape every measure of ``chamber_to_scrape`` for ``session``.

        Reads the all-measures XML index, and for each measure belonging to
        the requested chamber fetches its detail XML to collect the titles,
        sponsors, versions, actions (with votes when an action links to one)
        and a veto document, then saves the resulting ``Bill``.
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session
        # Second character of the measure id selects the bill type.
        type_by_letter = {'B': 'bill', 'C': 'concurrent resolution',
                          'R': 'resolution', 'N': 'nomination'}

        with self.urlopen(url) as bill_dir_page:
            root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
            for mr in root.xpath('//lastaction/msrgroup'):
                bill_id = mr.xpath('string(measure)').replace(" ", "")
                chamber = "upper" if bill_id[0] == "S" else "lower"

                # Looked up before the chamber filter, so an unknown type
                # letter raises KeyError even for the other chamber's bills.
                bill_type = type_by_letter[bill_id[1]]

                # measures of the other chamber are simply skipped
                if chamber != chamber_to_scrape:
                    continue

                link = mr.xpath('string(actionlink)').replace("..", "")
                # main_doc_url is computed but not used further down.
                main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
                main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as raw_details:
                    # The page is decoded as latin1 and re-encoded to UTF-8
                    # before parsing.
                    details_utf8 = raw_details.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_utf8, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title,
                                type=bill_type, longtitle=longtitle)

                    # Sponsors: only the first word of p_name is kept.
                    sponsor_words = details_root.xpath('string(//p_name)').split()
                    if sponsor_words:
                        sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, sponsor_link)
                        bill.add_sponsor("primary", sponsor_words[0],
                                         main_sponsor_url=sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                        bill.add_sponsor("cosponsor", leg, leg_url=leg_url)

                    # Versions: (xpath, version name, add unconditionally?).
                    # Conditional ones are added only when their path
                    # mentions "documents".
                    version_specs = [
                        ('string(//current_other)', "Current version", True),
                        ('string(//intro_other)', "As Introduced", True),
                        ('string(//cmtesub_other)', "Committee Substitute", False),
                        ('string(//passed_other)', "As Passed the " + chamber, False),
                        ('string(//asg_other)', "Approved by the Governor", False),
                    ]
                    for version_xpath, version_name, always_add in version_specs:
                        rel_path = details_root.xpath(version_xpath).replace("../../../../", "")
                        if always_add or "documents" in rel_path:
                            bill.add_version(version_name,
                                             "http://billstatus.ls.state.ms.us/" + rel_path)

                    # Actions
                    for action_el in details_root.xpath('//history/action'):
                        action_num = int(action_el.xpath('string(act_number)').strip())
                        act_vote = action_el.xpath('string(act_vote)').replace("../../../..", "")
                        raw_desc = action_el.xpath('string(act_desc)')
                        # act_desc starts with "MM/DD "; the year is taken
                        # from the first four characters of the session.
                        month_day, action_desc = raw_desc.split(" ", 1)
                        date = datetime.strptime(month_day + "/" + session[0:4], "%m/%d/%Y")

                        # A "(H)" / "(S)" tag names the chamber; anything
                        # else is attributed to the executive.
                        if action_desc.startswith("(H)"):
                            actor, action_text = "lower", action_desc[4:]
                        elif action_desc.startswith("(S)"):
                            actor, action_text = "upper", action_desc[4:]
                        else:
                            actor, action_text = "executive", action_desc

                        if "Veto" in action_text:
                            version_path = details_root.xpath("string(//veto_other)")
                            version_path = version_path.replace("../../../../", "")
                            version_url = "http://billstatus.ls.state.ms.us/" + version_path
                            bill.add_document("Veto", version_url)

                        # First matching prefix in self._action_types wins.
                        atype = next((prefix_type
                                      for prefix, prefix_type in self._action_types
                                      if action_text.startswith(prefix)),
                                     'other')

                        bill.add_action(actor, action_text, date, type=atype,
                                        action_num=action_num)

                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            vote = self.scrape_votes(vote_url, action_text, date, actor)
                            bill.add_vote(vote)
                            bill.add_source(vote_url)

                    bill.add_source(bill_details_url)
                    self.save_bill(bill)
Exemple #13
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one bill: its related documents first, then its history.

        Two pages are fetched. ``DocumentsForBill.asp`` supplies versions,
        fact sheets, agendas, calendars, amendments and videos. The bill
        overview page supplies sponsors, committee referrals and votes,
        readings, floor votes and the governor's action. The assembled
        ``Bill`` is passed to ``self.save_bill``.

        chamber -- 'upper' or 'lower'; used as the bill's chamber and as the
                   starting point for attributing floor actions.
        session -- session name, resolved through ``self.get_session_id`` and
                   ``utils.legislature_to_number``.
        bill_id -- bill identifier; everything but its last four characters
                   is passed to ``self.get_bill_type``.
        """
        session_id = self.get_session_id(session)
        url = base_url + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
                                                            bill_id, session_id)
        with self.urlopen(url) as docs_for_bill:
            root = html.fromstring(docs_for_bill)
            bill_title = root.xpath(
                            '//div[@class="ContentPageTitle"]')[1].text.strip()
            # Depending on the progress the bill has made through the house
            # some tables might not exist. The links that have
            # javascript:Show**** each have a table with related
            # documents/calendars/agendas/versions. The sponsors link is
            # skipped because that information is on the bill overview page
            # where all of the actions are found.
            doc_section_links = root.xpath(
                                    '//a[contains(@href, "javascript:Show")]')
            bill = Bill(session, chamber, bill_id, bill_title)
            bill.type = self.get_bill_type(bill_id[:-4])
            bill.add_source(url)
            for link in doc_section_links:
                link_id = utils.parse_link_id(link)
                link_text = link.text_content().strip()
                div_path = '//div[@id="%s"]/table//tr' % link_id
                if link_text == 'Show Versions':
                    # the first row has only a comment
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 4:
                            bill_version = tds[1].text_content().strip()
                            bill_html = tds[2].xpath('string(font/a/@href)')
                            bill_pdf = tds[3].xpath('string(font/a/@href)')
                            bill.add_version(bill_version,
                                             bill_html, pdf_url=bill_pdf)
                elif link_text == 'Show Summaries/Fact Sheets':
                    # the first row has only a comment
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        if len(tds) > 1:
                            fact_sheet = tds[1].text_content().strip()
                            fact_sheet_url = tds[1].xpath(
                                                        'string(font/a/@href)')
                            bill.add_document(fact_sheet,
                                              fact_sheet_url, type="fact sheet")
                elif link_text in ('Show Senate Agendas', 'Show House Agendas'):
                    # Substring test: re.match() is anchored at the start of
                    # the string and can never find 'House' after 'Show '.
                    if 'House' in link_text:
                        agenda_type = 'House Agenda'
                    else:
                        agenda_type = 'Senate Agenda'
                    # the first row has only a comment
                    # the second row is the table header
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 8:
                            # Columns 1-6 (revised, cancelled, date, time,
                            # room, pdf) are not stored.
                            agenda_committee = tds[0].text_content().strip()
                            agenda_html = tds[7].xpath('string(a/@href)').strip()
                            bill.add_document(agenda_committee,
                                              agenda_html, type=agenda_type)
                elif link_text in ('Show Senate Calendars',
                                   'Show House Calendar'):
                    # the first row has only a comment
                    # the second row is the table header
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 6:
                            calendar_name = tds[0].text_content().strip()
                            calendar_html = tds[5].xpath('string(a/@href)')
                            bill.add_document(calendar_name,
                                              calendar_html, type="calendar")
                elif link_text in ('Show Adopted Amendments',
                                   'Show Proposed Amendments'):
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        # Rows with fewer than three cells carry no link.
                        if len(tds) >= 3:
                            amendment_title = tds[1].text_content().strip()
                            amendment_link = tds[2].xpath('string(font/a/@href)')
                            bill.add_document(amendment_title,
                                              amendment_link, type='amendment')
                elif link_text == 'Show Bill Videos':
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 3:
                            video_title = tds[1].text_content().strip()
                            video_link = tds[2].xpath('string(a/@href)')
                            video_date = tds[0].text_content().strip()
                            bill.add_document(video_title, video_link,
                                              date=video_date, type='video')

        # action_url = 'http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49leg/2r/bills/hb2001o.asp'
        # again the actions page may or may not have a given table and the order
        # of the actions depends on the chamber the bill originated in.
        ses_num = utils.legislature_to_number(session)
        action_url = base_url + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
        with self.urlopen(action_url) as action_page:
            bill.add_source(action_url)
            root = html.fromstring(action_page)
            action_tables = root.xpath('/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')
            # True while the bill sits in the House. It starts in the chamber
            # of origin and flips on each TRANSMIT table, so it has to live
            # outside the loop to survive from one table to the next.
            house = chamber != 'upper'
            for table in action_tables:
                rows = table.cssselect('tr')
                # The heading cell ends in one extra character that is cut off.
                action = table.cssselect('td')[0].text_content().strip()[:-1]
                if action == 'SPONSORS':
                    if len(rows[0]) == 4:
                        for row in rows:
                            tds = row.cssselect('td')
                            # Second cell holds the sponsor (and link), third
                            # cell the sponsor type.
                            name_td, type_td = tds[1], tds[2]
                            bill.add_sponsor(type_td.text_content().strip(),
                                             name_td.text_content().strip(),
                                             sponsor_link=name_td.xpath('string(a/@href)'))
                elif action == 'COMMITTEES':
                    # the html for this table has meta tags that give the chamber
                    # and the committee abbreviation
                    # <meta name="HCOMMITTEE" content="RULES">
                    # question for actions: in the case of committees would House
                    # Rules be better for an actor?
                    for row in rows[1:]:
                        tds = row.cssselect('td')
                        meta_tag = row.cssselect('meta')[0]
                        actor = "%s:%s" % (meta_tag.get('name'), meta_tag.get('content'))
                        committee = meta_tag.get('content')
                        # NOTE(review): the stored action text is misspelled
                        # ('reffered'); kept as-is in case stored data relies
                        # on it.
                        act = 'committee:reffered'
                        date = datetime.datetime.strptime(tds[1].text_content().strip(), '%m/%d/%y')
                        bill.add_action(actor, act, date, type='committee:referred')
                        if len(tds) == 5:
                            # Use the report date when present, otherwise
                            # fall back to the referral date.
                            if re.match(r'\d{2}/\d{2}/\d{2}', tds[3].text_content().strip()):
                                date = datetime.datetime.strptime(tds[3].text_content().strip(), '%m/%d/%y')
                            else:
                                date = datetime.datetime.strptime(tds[1].text_content().strip(), '%m/%d/%y')
                            act = tds[4].text_content().strip()
                            status = 'other'
                            bill.add_action(actor, act, date, type=status, status=status)
                        elif len(tds) == 6:
                            where, committee = actor.split(':')
                            where = 'lower' if where == 'HCOMMITTEE' else 'upper'
                            date = datetime.datetime.strptime(tds[3].text_content().strip(), '%m/%d/%y')
                            # Tally looks like "(y-n-nv-exc)" or
                            # "(y-n-excused-absent-nv)"; drop the brackets.
                            counts = tds[4].text_content().strip()[1:-1].split('-')
                            if len(counts) == 4:
                                yes, no, nv, _exc = counts
                            else:
                                yes, no, _excused, _absent, nv = counts
                            motion = tds[5].text_content().strip()
                            # Compare as integers: as strings '9' > '10'.
                            yes, no, nv = int(yes), int(no), int(nv)
                            passed = yes > no
                            vote = Vote(where, date, motion, passed, yes, no, nv, committee=committee)
                            vote.add_source(tds[0].xpath('string(a/@href)').strip())

                            bill.add_vote(vote)
                elif action in ('HOUSE FIRST READ', 'HOUSE SECOND READ'):
                    aType = 'other'
                    if action == 'HOUSE FIRST READ':
                        aType = 'committee:referred'
                    bill.add_action('lower', action, utils.get_date(rows[0][1]),
                                    type=aType)
                elif action in ('SENATE FIRST READ', 'SENATE SECOND READ'):
                    # NOTE(review): the Senate marks the SECOND read as the
                    # referral while the House marks the FIRST; confirm this
                    # asymmetry is intended.
                    aType = 'other'
                    if action == 'SENATE SECOND READ':
                        aType = 'committee:referred'
                    bill.add_action('upper', action, utils.get_date(rows[0][1]),
                                    type=aType)
                elif action in ('TRANSMIT TO HOUSE', 'TRANSMIT TO SENATE'):
                    # The chamber name is at the END of the heading, so an
                    # anchored re.match('HOUSE', ...) could never succeed.
                    actor = 'lower' if action.endswith('HOUSE') else 'upper'
                    house = actor == 'lower'
                    date = utils.get_date(rows[0][1])
                    bill.add_action(actor, action, date)
                elif re.match(r'COW ACTION \d', action):
                    actor = 'lower' if house else 'upper'
                    for row in rows[1:]:
                        date = utils.get_date(row[1])
                        bill.add_action(actor, action, date, motion=row[2].text_content().strip())
                elif action in ('HOUSE FINAL READ', 'SENATE FINAL READ', 'THIRD READ'):
                    actor = 'lower' if house else 'upper'
                    for row in rows[1:]:
                        if row[0].text_content().strip() != 'Vote Detail':
                            continue
                        # Vote rows come in a 10-column layout and an
                        # 11-column layout that adds an 'amended' column.
                        if len(row) not in (10, 11):
                            continue
                        cells = [x.text_content().strip() for x in row]
                        extra = {}
                        if len(cells) == 10:
                            (_detail, date, ayes, nays, nv, exc, emer,
                             rfe, two_thirds, result) = cells
                        else:
                            (_detail, date, ayes, nays, nv, exc, emer,
                             amend, rfe, two_thirds, result) = cells
                            extra['amended'] = amend
                        passed = result == 'PASSED'
                        motion = action
                        date = datetime.datetime.strptime(date, '%m/%d/%y') if date else ''
                        vote = Vote(actor, date, motion, passed, int(ayes), int(nays), int(nv),
                                    excused=int(exc), emergency=emer, rfe=rfe,
                                    two_thirds_vote=two_thirds, type="passage",
                                    **extra)
                        vote.add_source(row[0].xpath('string(a/@href)').strip())
                        bill.add_vote(vote)

                elif action == 'TRANSMITTED TO':
                    actor = 'lower' if house else 'upper'
                    act = action + ": " + rows[0][1].text_content().strip()
                    date = rows[0][2].text_content().strip()
                    date = datetime.datetime.strptime(date, '%m/%d/%y')
                    bill.add_action(actor, act, date, type='governor:received')
                    # need action and chapter, chaptered version if they exist
                    act, date, chapter, version = '', '', '', ''
                    for row in rows[1:]:
                        label = row[0].text_content().strip()
                        if label == 'ACTION:':
                            act = row[1].text_content().strip()
                            date = datetime.datetime.strptime(row[2].text_content().strip(), '%m/%d/%y')
                        elif label == 'CHAPTER':
                            chapter = row[1].text_content().strip()
                        elif label == 'CHAPTERED VERSION':
                            # text_content is a method and must be called.
                            version = row[1].text_content().strip()
                    if act:
                        action_type = 'governor:signed' if act == 'SIGNED' else 'governor:vetoed'
                        if chapter:
                            bill.add_action('governor', act, date,
                                            type=action_type, chapter=chapter,
                                            chaptered_version=version)
                        else:
                            bill.add_action('governor', act, date,
                                            type=action_type)
        self.save_bill(bill)
        self.log("saved: " + bill['bill_id'])
Exemple #14
0
    def scrape_bills(self, session):
        """Scrape every measure of both chambers for ``session``.

        Reads the all-measures XML index and, for each measure, fetches its
        detail XML to collect the titles, sponsors, versions, actions (with
        votes when an action links to one) and a veto document, then saves
        the resulting ``Bill``. The chamber is 'upper' when the measure id
        starts with 'S' and 'lower' otherwise.
        """
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        with self.urlopen(url) as bill_dir_page:
            root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
            for mr in root.xpath('//lastaction/msrgroup'):
                bill_id = mr.xpath('string(measure)').replace(" ", "")
                chamber = "upper" if bill_id[0] == "S" else "lower"
                link = mr.xpath('string(actionlink)').replace("..", "")
                bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
                with self.urlopen(bill_details_url) as details_page:
                    # The page is decoded as latin1 and re-encoded to UTF-8
                    # before parsing.
                    details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                    details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                    title = details_root.xpath('string(//shorttitle)')
                    longtitle = details_root.xpath('string(//longtitle)')

                    bill = Bill(session, chamber, bill_id, title, longtitle=longtitle)

                    # Sponsors: only the first word of p_name is kept. A
                    # measure with no principal author is not an error, so
                    # the primary sponsor is simply omitted then.
                    sponsor_words = details_root.xpath('string(//p_name)').split()
                    if sponsor_words:
                        main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                        main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                        bill.add_sponsor("Primary sponsor", sponsor_words[0],
                                         main_sponsor_url=main_sponsor_url)
                    for author in details_root.xpath('//authors/additional'):
                        leg = author.xpath('string(co_name)').replace(" ", "_")
                        leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                        bill.add_sponsor("additional sponsor", leg, leg_url=leg_url)

                    # Versions: (xpath, version name, add unconditionally?).
                    # Conditional ones are added only when their path
                    # mentions "documents".
                    version_specs = [
                        ('string(//current_other)', "Current version", True),
                        ('string(//intro_other)', "As Introduced", True),
                        ('string(//cmtesub_other)', "Committee Substitute", False),
                        ('string(//passed_other)', "As Passed the " + chamber, False),
                        ('string(//asg_other)', "Approved by the Governor", False),
                    ]
                    for version_xpath, version_name, always_add in version_specs:
                        rel_path = details_root.xpath(version_xpath).replace("../../../../", "")
                        if always_add or "documents" in rel_path:
                            bill.add_version(version_name,
                                             "http://billstatus.ls.state.ms.us/" + rel_path)

                    # Actions
                    for action_el in details_root.xpath('//history/action'):
                        action_num = int(action_el.xpath('string(act_number)').strip())
                        raw_desc = action_el.xpath('string(act_desc)')
                        act_vote = action_el.xpath('string(act_vote)').replace("../../../..", "")

                        # act_desc is "MM/DD (H) text", "MM/DD (S) text" or
                        # "MM/DD text"; the year comes from the first four
                        # characters of the session.
                        desc_parts = raw_desc.split(None, 1)
                        date = datetime.strptime(desc_parts[0] + "/" + session[0:4], "%m/%d/%Y")
                        remainder = desc_parts[1] if len(desc_parts) > 1 else ""

                        # The chamber tag is the token right after the date.
                        # Actions without a tag belong to the executive and
                        # keep their full text.
                        if remainder.startswith("(H)"):
                            actor = "lower"
                            action_text = remainder[3:].lstrip()
                        elif remainder.startswith("(S)"):
                            actor = "upper"
                            action_text = remainder[3:].lstrip()
                        else:
                            actor = "Executive"
                            action_text = remainder

                        if "Veto" in action_text:
                            version_path = details_root.xpath("string(//veto_other)")
                            version_path = version_path.replace("../../../../", "")
                            version_url = "http://billstatus.ls.state.ms.us/" + version_path
                            bill.add_document("Veto", version_url)

                        bill.add_action(actor, action_text, date, action_num=action_num)

                        # Only actions that link to a vote page have one.
                        if act_vote:
                            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                            vote = self.scrape_votes(vote_url, action_text, date, actor)
                            bill.add_vote(vote)
                    self.save_bill(bill)
Exemple #15
0
    def scrape(self, chamber, session):
        """Scrape the bills listed on the daily status page for ``session``.

        Every link on the status page whose ``bill=`` number starts with a
        digit in the chamber's range (5-9 for ``upper``, 1-4 otherwise) is
        followed. For each bill page a ``Bill`` is built with its actions,
        versions, documents, primary sponsors and roll-call votes, and is
        then saved.
        """
        sep = "<h1>House</h1>"

        # First digit of the bill number selects the chamber.
        reg = "[5-9]" if chamber == "upper" else "[1-4]"
        bill_link_re = re.compile("bill=" + reg + "[0-9]{3}")

        year = str(year_from_session(session))

        with self.urlopen("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + year) as page_html:
            # NOTE(review): separate_content is called the same way for both
            # chambers; only the bill-number pattern above tells them apart.
            page = lxml.html.fromstring(separate_content(page_html, sep))

            for _element, _attribute, link, _pos in page.iterlinks():
                if bill_link_re.search(link) is None:
                    continue

                bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
                with self.urlopen(bill_page_url) as bill_page_html:
                    bill_page = lxml.html.fromstring(bill_page_html)
                    raw_title = bill_page.cssselect("title")
                    # The bill id is the first two words of the <title>.
                    split_title = raw_title[0].text_content().split(" ")
                    bill_id = (split_title[0] + " " + split_title[1]).strip()

                    title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                    title = title_element.text_content()

                    bill = Bill(session, chamber, bill_id, title)
                    bill.add_source(bill_page_url)

                    self.scrape_actions(bill_page, bill)

                    # Distinct names so the outer loop's variables are not
                    # rebound by this inner loop.
                    for doc_element, _doc_attr, doc_link, _doc_pos in bill_page.iterlinks():
                        if re.search("billdocs", doc_link) is not None:
                            if re.search("Amendments", doc_link) is not None:
                                bill.add_document("Amendment: " + doc_element.text_content(), doc_link)
                            elif re.search("Bills", doc_link) is not None:
                                bill.add_version(doc_element.text_content(), doc_link)
                            else:
                                bill.add_document(doc_element.text_content(), doc_link)
                        elif re.search("senators|representatives", doc_link) is not None:
                            with self.urlopen(doc_link) as senator_page_html:
                                senator_page = lxml.html.fromstring(senator_page_html)
                                try:
                                    name_tuple = self.scrape_legislator_name(senator_page)
                                    bill.add_sponsor("primary", name_tuple[0])
                                except Exception:
                                    # Best effort: a legislator page that
                                    # cannot be parsed just yields no
                                    # sponsor. Catching Exception rather than
                                    # everything lets KeyboardInterrupt and
                                    # SystemExit stop the scrape.
                                    pass
                        elif re.search("ShowRollCall", doc_link) is not None:
                            # The link carries the two ids as "<id1>,<id2>".
                            ids = re.search("([0-9]+,[0-9]+)", doc_link)
                            if ids is None:
                                # No id pair, so no vote page can be built.
                                continue
                            id1, id2 = ids.group(0).split(",")
                            url = votes_url(id1, id2)
                            with self.urlopen(url) as vote_page_html:
                                vote_page = lxml.html.fromstring(vote_page_html)
                                self.scrape_votes(vote_page, bill, url)

                    self.save_bill(bill)