Exemple #1
0
    def process_bill(self, data):
        """Build a ``Bill`` from the bill dictionary ``data`` and save it.

        Reads the session, identifier, title, subjects and classification
        from ``data``, then copies over its first abstract, extras, actions,
        sources, sponsorships, versions, documents, other titles and
        related bills before handing the result to ``self.save_bill``.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        # A 'legislature' classification is recorded as the upper chamber.
        chamber = 'upper' if chamber == 'legislature' else chamber

        bill = Bill(
            data['legislative_session'],
            chamber,
            data['identifier'],
            data['title'],
            subjects=data['subject'],
            type=data['classification'],
        )

        abstracts = data['abstracts']
        if abstracts:
            # Only the first abstract is kept, as the bill summary.
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for entry in data['actions']:
            actor = parse_psuedo_id(entry['organization_id'])['classification']
            # Sort related entity names into committees (organizations) and
            # legislators (people); any other entity type is ignored.
            names = {'organization': [], 'person': []}
            for entity in entry['related_entities']:
                bucket = names.get(entity['entity_type'])
                if bucket is not None:
                    bucket.append(entity['name'])
            bill.add_action(
                actor,
                entry['description'],
                parse_date(entry['date']),
                type=_action_categories(entry['classification']),
                committees=names['organization'],
                legislators=names['person'],
                **entry.get('extras', {})
            )

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # Versions and documents share one shape: each item carries a note,
        # a date, optional extras and a list of links, and every link
        # becomes its own entry on the bill.
        for key, add_link in (('versions', bill.add_version),
                              ('documents', bill.add_document)):
            for item in data[key]:
                for link in item['links']:
                    add_link(item['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(item['date']),
                             **item.get('extras', {}))

        for other in data['other_titles']:
            bill.add_title(other['title'])

        for companion in data['related_bills']:
            # Related bills are registered under this bill's own chamber.
            bill.add_companion(companion['identifier'],
                               companion['legislative_session'],
                               chamber)

        self.save_bill(bill)
Exemple #2
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.

        The title is taken from the "Description" paragraph of the detail
        page; when that is absent the "Long Description" page is tried, and
        if neither yields text the placeholder 'No title found.' is used
        and a warning is logged.

        Args:
            chamber: chamber name; 'house'/'senate' (any case) are mapped
                to 'lower'/'upper', anything else is used unchanged.
            session: session value passed through to ``Bill``.
            bill_detail_url: URL of the bill detail page (also recorded as
                the bill's source).
            version_list_url: passed through to ``self.extract_versions``.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)

        bill_title = None
        bill_title_text = self.get_node(doc, '//h2[text()[contains(.,'
            '"Description")]]/following-sibling::p/text()')
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            long_desc_url = self.get_node(doc, '//a[text()[contains(.,'
                '"Long Description")]]/@href')
            # Only follow the link when the page actually has one; fetching
            # a missing (None) URL would fail instead of falling back to
            # the placeholder title below.
            if long_desc_url is not None:
                long_desc_page = self.lxmlize(long_desc_url)
                long_desc_text = self.get_node(long_desc_page, '//h1/'
                    'following-sibling::p/text()')
                if long_desc_text is not None:
                    bill_title = long_desc_text.strip()

        if bill_title is None:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)

        # The second character of the bill id selects the bill type.
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        # Add subjects.  Currently we are not mapping to Open States
        # standardized subjects, so use 'scraped_subjects'
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Get companion bill.
        companion_links = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(companion_links[0]) if companion_links else None
        if companion is not None:
            # The chamber is only looked up when there is a companion to
            # attach it to.
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Exemple #3
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Scrape a single bill's detail page and save the resulting Bill.

        The id, title and type are read from the page, then the source,
        scraped subjects, companion bill, sponsors, actions and versions
        are attached before the bill is passed to ``self.save_bill``.
        """
        # 'house'/'senate' (any case) become 'lower'/'upper'; every other
        # value is kept as given.
        chamber = {'house': 'lower', 'senate': 'upper'}.get(chamber.lower(),
                                                            chamber)

        # Fetch and parse the detail page.
        doc = lxml.html.fromstring(self.urlopen(bill_detail_url))

        # Basic parts of the bill.
        bill_id = doc.xpath('//h1/text()')[0]
        title_nodes = doc.xpath('//h2/following-sibling::p/text()')
        bill_title = title_nodes[0].strip()
        # The second character of the id selects the bill type.
        type_by_letter = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution',
        }
        bill = Bill(session, chamber, bill_id, bill_title,
                    type=type_by_letter[bill_id[1]])

        bill.add_source(bill_detail_url)

        # Subjects are not mapped to Open States standardized subjects,
        # so they are stored under 'scraped_subjects'.
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Companion bill, when the status table links to one.
        companion_links = doc.xpath(
            '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
        )
        if companion_links:
            companion = self.make_bill_id(companion_links[0])
        else:
            companion = None
        companion_chamber = self.chamber_from_bill(companion)
        if companion is not None:
            bill.add_companion(companion, chamber=companion_chamber)

        # Sponsors, then actions, then versions; each step returns the bill.
        bill = self.extract_sponsors(bill, doc, chamber)
        bill = self.extract_actions(bill, doc, chamber)
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Exemple #4
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape the bill page at ``url`` into a ``Bill`` and save it.

        Args:
            session: session value for the bill; its first four characters
                are parsed as a year when a carried-over bill is detected.
            chamber: chamber passed through to ``Bill``.
            bill_type: stored as the bill's ``type``.
            url: bill page to fetch; also recorded as the bill's source.
        """
        bill_page = lxml.html.fromstring(self.get(url).text)
        scraped_bill_id = bill_page.xpath(
            "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
        # Keep only the part of the link text before the first space.
        bill_id = scraped_bill_id.split(' ')[0]

        versions_table = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]
        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # Subjects are the ';'-separated parts of the report title. Every
        # blank part is dropped, not just the first one, so input such as
        # "a;;b;" cannot leave an empty subject behind.
        stripped = (part.strip() for part in meta['Report Title'].split(";"))
        subs = [part for part in stripped if part]

        b = Bill(session,
                 chamber,
                 bill_id,
                 title=meta['Measure Title'],
                 summary=meta['Description'],
                 referral=meta['Current Referral'],
                 subjects=subs,
                 type=bill_type)
        b.add_source(url)

        companion = meta['Companion'].strip()
        if companion:
            b['companion'] = companion

        # The last text cell of the status table says whether the bill was
        # carried over; if so, link it to the same bill id in the session
        # named after the previous year.
        prior = bill_page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )[-1]
        if 'carried over' in prior.lower():
            prior_session = '{} Regular Session'.format(
                str(int(session[:4]) - 1))
            b.add_companion(bill_id, prior_session, chamber)

        for sponsor in meta['Introducer(s)']:
            b.add_sponsor(type='primary', name=sponsor)

        # Both parsers receive the bill itself; their return values are
        # not used here.
        self.parse_bill_actions_table(b, action_table)
        self.parse_bill_versions_table(b, versions_table)

        self.save_bill(b)
Exemple #5
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """
        Collect everything known about one bill from its detail page.

        Builds a Bill from the page's id and title, attaches the source,
        scraped subjects, companion, sponsors, actions and versions, and
        hands the result to ``self.save_bill``.
        """
        # Normalise 'house'/'senate' (case-insensitively) to 'lower'/'upper';
        # other values pass through untouched.
        lowered = chamber.lower()
        if lowered == 'house':
            chamber = 'lower'
        elif lowered == 'senate':
            chamber = 'upper'

        # Get html and parse
        page_source = self.urlopen(bill_detail_url)
        doc = lxml.html.fromstring(page_source)

        # Get the basic parts of the bill
        heading_texts = doc.xpath('//h1/text()')
        bill_id = heading_texts[0]
        description_texts = doc.xpath('//h2/following-sibling::p/text()')
        bill_title = description_texts[0].strip()
        # The bill type is keyed by the second character of the bill id.
        bill_types = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution',
        }
        bill_type = bill_types[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        # Subjects are stored as 'scraped_subjects' because they are not
        # mapped to Open States standardized subjects.
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Get companion bill.
        found = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()')
        companion = None
        if found:
            companion = self.make_bill_id(found[0])
        companion_chamber = self.chamber_from_bill(companion)
        if companion is not None:
            bill.add_companion(companion, chamber=companion_chamber)

        # Sponsors, actions and versions are each added by a helper that
        # returns the (updated) bill.
        bill = self.extract_sponsors(bill, doc, chamber)
        bill = self.extract_actions(bill, doc, chamber)
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Exemple #6
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Fetch one bill page, build its ``Bill`` and save it.

        Args:
            session: session value for the bill; the leading four
                characters are read as a year for carried-over bills.
            chamber: chamber handed to ``Bill``.
            bill_type: value stored as the bill's ``type``.
            url: page to scrape, also added as the bill's source.
        """
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        measure_link = bill_page.xpath(
            "//a[contains(@id, 'LinkButtonMeasure')]")[0]
        # The bill id is the link text up to the first space.
        bill_id = measure_link.text_content().split(' ')[0]

        versions_table = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # Split the report title on ';' into subjects and discard ALL blank
        # pieces; list.remove("") would only have removed the first one.
        subs = []
        for piece in meta['Report Title'].split(";"):
            piece = piece.strip()
            if piece:
                subs.append(piece)

        b = Bill(session, chamber, bill_id, title=meta['Measure Title'],
                 summary=meta['Description'],
                 referral=meta['Current Referral'],
                 subjects=subs,
                 type=bill_type)
        b.add_source(url)

        companion = meta['Companion'].strip()
        if companion:
            b['companion'] = companion

        # A final status cell mentioning 'carried over' links this bill to
        # the same id in the session named after the previous year.
        prior = bill_page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
        if 'carried over' in prior.lower():
            prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
            b.add_companion(bill_id, prior_session, chamber)

        for sponsor in meta['Introducer(s)']:
            b.add_sponsor(type='primary', name=sponsor)

        # The table parsers are given the bill directly; nothing they
        # return is needed afterwards.
        self.parse_bill_actions_table(b, action_table)
        self.parse_bill_versions_table(b, versions_table)

        self.save_bill(b)
Exemple #7
0
    def scrape_bills(self, session, year_abr):
        """Build and save every bill for ``session`` from the database dumps.

        Bills are created from the 'MainBill' table, then enriched from the
        sponsor ('BillSpon'), document ('BillWP'), vote (zip archives
        fetched over FTP), action ('BillHist') and subject ('BillSubj')
        data.  Bills that end up with neither actions nor versions are
        reported as probable phony bills and are not saved.

        Args:
            session: session identifier; stored on each bill as a string.
            year_abr: first year of the session; used to build document
                URLs and, together with the following year, the names of
                the vote archives.

        Raises:
            Exception: if a document record has a doctype that is not in
                ``self._doctypes``.
        """
        #Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['IdenticalBillNumber'].strip():
                bill.add_companion(rec['IdenticalBillNumber'].split()[0])

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            # 'P' marks a primary sponsor; everything else is a cosponsor.
            if rec["Type"] == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsor(sponsor_type, name)

        #Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            # Keep only the last two components of the backslash-separated
            # path and join them with a forward slash for the URL.
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            if rec['DocType'] in self._version_types:
                # Clean HTMX links.
                if htm_url.endswith('HTMX'):
                    htm_url = re.sub('X$', '', htm_url)

                # NOTE(review): a URL ending in neither 'HTM' nor 'wpd'
                # leaves ``mimetype`` unbound (first record) or carrying the
                # previous record's value -- confirm what add_version should
                # receive for other extensions before changing this.
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zipedfile = zipfile.ZipFile(s_vote_zip)

            # Start every archive with an empty mapping so that an archive
            # containing neither expected file cannot fail on an unbound
            # name or re-add the previous archive's votes.
            votes = {}

            # Chamber and vote kind depend only on the archive name.
            if filename.startswith(('A', 'CA')):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = zipedfile.open(vfile, 'U')
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                # NOTE(review): as before, the mapping is reset for each
                # file that is present, so when both files exist only the
                # "End" file's votes are kept -- confirm this is intended.
                votes = {}

                for rec in vdict_file:

                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    # One Vote per (bill, chamber, motion); each record adds
                    # a single legislator's vote to it.
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None, None,
                                              None, None, bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)

            # release the archive handle, then remove temp file
            zipedfile.close()
            os.remove(s_vote_zip)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count

                # Veto override.
                if vote['motion'] == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    vote['passed'] = False
                    if vote['chamber'] == 'lower':
                        if vote_yes_count >= 54:
                            vote['passed'] = True
                    elif vote['chamber'] == 'upper':
                        if vote_yes_count >= 27:
                            vote['passed'] = True

                # Regular vote.
                elif vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False

                vote_bill_id = vote["bill_id"]
                # Votes can reference bills that were never created (for
                # example ones skipped for a blank title); warn and move on
                # like the other sections instead of raising KeyError.
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # A bill with no actions and no versions is treated as phony
            # and is not saved; every other bill gets its source and is
            # saved.
            if not bill['actions'] and not bill['versions']:
                self.warning('probable phony bill detected %s',
                             bill['bill_id'])
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                self.save_bill(bill)

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Exemple #8
0
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary
    and assembly floor votes from the assembly page.

    Construction builds ``self.bill`` immediately.  ``self.succeeded`` is
    True only when the supplied document contained ``<pre>`` text and every
    scraping step ran.
    '''

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        '''Store the inputs, create the Bill and scrape the page.

        ``bill_id_parts`` must unpack into three values, stored as
        ``letter``, ``number`` and ``version``.
        '''
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text fetched by _get_chunks().
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

    def _build(self):
        '''Run every scraping step, unless the document has no <pre> text.'''
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        '''Return the (summary, actions) text, fetching it at most once.'''
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y') % self.bill_id
            doc = self.url2lxml(url)
            # The page is expected to hold exactly two <pre> text nodes;
            # any other number makes this unpacking raise ValueError.
            summary, actions = doc.xpath('//pre/text()')
            self.data['summary'], self.data['actions'] = summary, actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        '''Record ``url`` as a bill source and return the scraper's parse.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        '''Add the printable bill page as an HTML version named by bill id.'''
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=' + self.bill_id
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        '''Add each bill listed in the summary's "SAME AS" chunk.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS    ', '')
                if companions != 'No same as':
                    # Entries are separated by commas or backslashes.
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        # Strip the 'Same as ' / 'Uni' prefixes and a
                        # trailing '-<word>' suffix from each entry.
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        '''Attach the sponsor's memorandum link (lower chamber only).'''
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=&Memo=Y') % self.bill_id
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        '''Use the last blank-line-separated summary chunk as the summary.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        '''Return ``name`` without titles, role notes or stray punctuation.

        Matching is case-insensitive and removes every occurrence.
        '''
        junk = [
            r'^Rules\s+',
            r'\(2nd Vice Chairperson\)',
            r'\(MS\)',
            r'Assemblyman',
            r'Assemblywoman',
            r'Senator']
        for rgx in junk:
            # ``flags`` must be passed by keyword: the fourth positional
            # argument of re.sub is ``count``, so a positional re.I would
            # limit replacements to 2 and leave matching case-sensitive.
            name = re.sub(rgx, '', name, flags=re.I)
        return name.strip('(), ')

    def get_sponsors(self):
        '''Add primary sponsors and cosponsors listed in the summary.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        # Only 'SPONSOR' is primary; the other headings are cosponsors.
        spons_swap = {'SPONSOR': 'primary'}
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):

                        if not sponsor:
                            continue

                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary', 'Rules Committee',
                                                  chamber='lower')

                        sponsor = self._scrub_name(sponsor)

                        # Figure out sponsor type.
                        _sponsor_type = spons_swap.get(
                                            sponsor_type, 'cosponsor')

                        self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                         official_type=sponsor_type)

    def get_actions(self):
        '''Add every dated action line from the actions text to the bill.'''
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            # All-uppercase action text is attributed to the upper chamber.
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        '''Add one floor Vote per table on the assembly votes page.'''

        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=&Votes=Y')
        doc = self.url2lxml(url % self.bill_id)
        if doc is None:
            return

        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        for table in doc.xpath('//table'):

            # The date and tally are the text of the element that follows
            # their labels inside the table caption.
            date_label = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date_label[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            tally_label = table.xpath(
                'caption/span/label[contains(., "YEA/NAY:")]')
            tally = next(tally_label[0].itersiblings()).text
            yes_count, no_count = map(int, tally.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Raw non-yes/no values for THIS vote only; a mapping shared
            # across tables would leak one vote's names into the next.
            actual_vote = collections.defaultdict(list)

            # Cells alternate name, vote value; a trailing unpaired cell
            # is ignored.
            tds = table.xpath('tr/td/text()')
            for name, vote_val in zip(tds[0::2], tds[1::2]):
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Exemple #9
0
    def process_bill(self, data):
        """Turn the bill dictionary ``data`` into a ``Bill`` and save it.

        Besides the core fields, the bill receives its summary (the first
        abstract, if any), extras, actions, sources, sponsors, versions,
        documents, other titles, related bills and alternate identifiers.
        """
        origin = parse_psuedo_id(data['from_organization'])
        chamber = origin['classification']
        # Bills classified under 'legislature' are filed as 'upper'.
        if chamber == 'legislature':
            chamber = 'upper'

        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])

        abstracts = data['abstracts']
        if abstracts:
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(act['organization_id'])['classification']

            # Route each related entity's name to the matching list;
            # entity types other than organization/person are skipped.
            committees, legislators = [], []
            destinations = {'organization': committees,
                            'person': legislators}
            for entity in act['related_entities']:
                destination = destinations.get(entity['entity_type'])
                if destination is not None:
                    destination.append(entity['name'])

            bill.add_action(
                actor,
                act['description'],
                parse_date(act['date']),
                type=_action_categories(act['classification']),
                committees=committees,
                legislators=legislators,
                **act.get('extras', {})
            )

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # Every link of a version becomes its own version entry.
        for ver in data['versions']:
            for link in ver['links']:
                bill.add_version(
                    ver['note'], link['url'],
                    mimetype=link['media_type'],
                    date=parse_date(ver['date']),
                    **ver.get('extras', {})
                )

        # Likewise, every link of a document becomes its own document entry.
        for document in data['documents']:
            for link in document['links']:
                bill.add_document(
                    document['note'], link['url'],
                    mimetype=link['media_type'],
                    date=parse_date(document['date']),
                    **document.get('extras', {})
                )

        for alt_title in data['other_titles']:
            bill.add_title(alt_title['title'])

        # Related bills are recorded under this bill's own chamber.
        for rel_bill in data['related_bills']:
            bill.add_companion(rel_bill['identifier'],
                               rel_bill['legislative_session'],
                               chamber)

        alternate_ids = []
        for other in data['other_identifiers']:
            alternate_ids.append(other['identifier'])
        bill['alternate_bill_ids'] = alternate_ids

        self.save_bill(bill)
Exemple #10
0
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` from one bill record of the Open Legislation API.

        Adds sponsors, the first "same as" companion bill, actions, Senate
        votes (upper-chamber bills only), one HTML and one PDF version link
        per amendment, and the source URLs.

        :param session: session identifier given to ``Bill`` and used as
            ``session_year`` in the API source URL.
        :param bill_data: bill record as a dict; the keys read here are
            ``title``, ``summary``, ``amendments``, ``sponsor``,
            ``actions`` and ``votes``.
        :returns: the populated ``Bill``.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (prefix, number, active_version)) = details

        bill = Bill(
            session,
            bill_chamber,
            bill_id,
            title,
            type=bill_type,
            summary=bill_data['summary'])

        if bill_data['title'] is None:
            bill['title'] = bill_data['summary']

        amendments = bill_data['amendments']['items']
        bill_active_version = amendments[active_version]

        # Parse sponsors.
        sponsor = bill_data['sponsor']
        if sponsor['rules']:
            bill.add_sponsor('primary', 'Rules Committee',
                             chamber=bill_chamber)
        elif not sponsor['budget']:
            bill.add_sponsor('primary', sponsor['member']['shortName'])

            # There *shouldn't* be cosponsors if there is no sponsor.
            for cosponsor in bill_active_version['coSponsors']['items']:
                bill.add_sponsor('cosponsor', cosponsor['shortName'])

        # List companion bill. "sameAs" may be missing or empty, in which
        # case there is nothing to attach (previously a missing key raised
        # KeyError on the default ``{}``).
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            companion = same_as_items[0]
            companion_bill_id = companion['basePrintNo']

            # Build companion bill session as "<start>-<start + 1>".
            start_year = companion['session']
            companion_bill_session = '-'.join(
                [str(start_year), str(start_year + 1)])

            # Determine companion bill chamber.
            companion_bill_prefix = self._parse_bill_number(
                companion_bill_id)[0]
            companion_bill_chamber = self._parse_bill_prefix(
                companion_bill_prefix)[0]

            # Attach companion bill data.
            bill.add_companion(
                companion_bill_id,
                companion_bill_session,
                companion_bill_chamber,
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_date = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d').date()
            types, attrs = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                chamber,
                action['text'],
                action_date,
                type=types,
                **attrs)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                bill.add_vote(self._parse_senate_votes(vote_data))
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        for amendment in amendments.values():
            version = amendment['printNo']

            # The URL (not the label) must be the second argument. It is
            # built from the amendment's own print number so every version
            # gets a distinct link.
            # NOTE(review): the original built these URLs from bill_id but
            # never used them; confirm both endpoints accept print numbers.
            html_url = ('http://assembly.state.ny.us/leg/?sh=printbill&bn='
                        '{}&term={}'.format(version, self.term_start_year))
            bill.add_version(version + ' HTML', html_url,
                             mimetype='text/html')

            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'.format(
                self.term_start_year, version)
            bill.add_version(version + ' PDF', pdf_url,
                             mimetype='application/pdf')

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_path = self.api_client.resources['bill'].format(
            session_year=session,
            bill_id=bill_id,
            summary='',
            detail='')
        bill.add_source(self.api_client.root + api_path)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        return bill
Exemple #11
0
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` from one bill record of the Open Legislation API.

        Adds sponsors, the first "same as" companion bill, actions, Senate
        votes (upper-chamber bills only), one HTML and one PDF version link
        per amendment, and the source URLs.

        :param session: session identifier given to ``Bill`` and used as
            ``session_year`` in the API source URL.
        :param bill_data: bill record as a dict; the keys read here are
            ``title``, ``summary``, ``amendments``, ``sponsor``,
            ``actions`` and ``votes``.
        :returns: the populated ``Bill``.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         (prefix, number, active_version)) = details

        bill = Bill(session,
                    bill_chamber,
                    bill_id,
                    title,
                    type=bill_type,
                    summary=bill_data['summary'])

        if bill_data['title'] is None:
            bill['title'] = bill_data['summary']

        amendments = bill_data['amendments']['items']
        bill_active_version = amendments[active_version]

        # Parse sponsors.
        sponsor = bill_data['sponsor']
        if sponsor['rules']:
            bill.add_sponsor('primary',
                             'Rules Committee',
                             chamber=bill_chamber)
        elif not sponsor['budget']:
            bill.add_sponsor('primary', sponsor['member']['shortName'])

            # There *shouldn't* be cosponsors if there is no sponsor.
            for cosponsor in bill_active_version['coSponsors']['items']:
                bill.add_sponsor('cosponsor', cosponsor['shortName'])

        # List companion bill. "sameAs" may be missing or empty, in which
        # case there is nothing to attach (previously a missing key raised
        # KeyError on the default ``{}``).
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            companion = same_as_items[0]
            companion_bill_id = companion['basePrintNo']

            # Build companion bill session as "<start>-<start + 1>".
            start_year = companion['session']
            companion_bill_session = '-'.join(
                [str(start_year), str(start_year + 1)])

            # Determine companion bill chamber.
            companion_bill_prefix = self._parse_bill_number(
                companion_bill_id)[0]
            companion_bill_chamber = self._parse_bill_prefix(
                companion_bill_prefix)[0]

            # Attach companion bill data.
            bill.add_companion(
                companion_bill_id,
                companion_bill_session,
                companion_bill_chamber,
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_date = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d').date()
            types, attrs = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(chamber,
                            action['text'],
                            action_date,
                            type=types,
                            **attrs)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                bill.add_vote(self._parse_senate_votes(vote_data))
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        for amendment in amendments.values():
            version = amendment['printNo']

            # The URL (not the label) must be the second argument. It is
            # built from the amendment's own print number so every version
            # gets a distinct link.
            # NOTE(review): the original built these URLs from bill_id but
            # never used them; confirm both endpoints accept print numbers.
            html_url = ('http://assembly.state.ny.us/leg/?sh=printbill&bn='
                        '{}&term={}'.format(version, self.term_start_year))
            bill.add_version(version + ' HTML',
                             html_url,
                             mimetype='text/html')

            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'.format(
                self.term_start_year, version)
            bill.add_version(version + ' PDF',
                             pdf_url,
                             mimetype='application/pdf')

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_path = self.api_client.resources['bill'].format(
            session_year=session,
            bill_id=bill_id,
            summary='',
            detail='')
        bill.add_source(self.api_client.root + api_path)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        return bill
Exemple #12
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on bills from a number of DBF files.

        Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ tables
        and the zipped roll-call vote files, attaches everything to ``Bill``
        objects keyed by bill id, then saves every bill.

        Bills with a blank synopsis are never created, so records in the
        other tables that point at an unknown bill are logged with
        ``self.warning`` and skipped instead of raising ``KeyError``.

        :param session: session identifier (stringified for ``Bill``).
        :param year_abr: year value passed to ``get_dbf`` and used to name
            the vote files; the following year is derived as ``+ 1``.
        """

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_id = bill_type + str(int(rec["billnumber"]))
            title = rec["synopsis"]
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['identicalb']:
                bill.add_companion(rec['identicalb'].split()[0])
            # TODO: last session info is in there too
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            sponsor_type = "primary" if rec["type"] == 'P' else "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # keep only the last directory and the file name of the path
            document = rec["document"].split('\\')
            document = document[-2] + "/" + document[-1]
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
                year_abr, document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                # NOTE(review): a URL ending in neither 'HTM' nor 'wpd'
                # leaves ``mimetype`` unbound on the first record, or stale
                # from an earlier record afterwards. Decide how such
                # documents should be typed.
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                          ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue

            if filename.startswith(('A', 'CA')):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            votes = {}
            zipedfile = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                vote_file = zipedfile.open("%s.txt" % filename, 'U')

                for rec in csv.DictReader(vote_file):
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'],
                                            rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None,
                                              None, None, None,
                                              bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)
            finally:
                # close the archive and remove the temp file even when a
                # record fails to parse
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            date = rec["dateaction"]
            actor = actor_map[rec["house"]]
            comment = rec["comment"]
            action, atype = self.categorize_action(rec["action"], bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Exemple #13
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill and saves it.

        Returns early, without saving, when the detail page states that the
        bill has not been transmitted to the other chamber.

        :param chamber: 'House' / 'Senate' (any case) are normalised to
            'lower' / 'upper'; any other value is used unchanged.
        :param session: session identifier passed to ``Bill``.
        :param bill_detail_url: URL of the bill detail page; also recorded
            as the bill's source.
        :param version_list_url: passed through to ``extract_versions``.
        """
        chamber = {'house': 'lower',
                   'senate': 'upper'}.get(chamber.lower(), chamber)

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and 'has not been transmitted' in transmit_check.strip()):
            self.logger.debug(
                'Bill has not been transmitted to other chamber ... skipping {0}'
                .format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)

        # Title: prefer the short description on the detail page, fall back
        # to the linked long-description page, then to a placeholder.
        bill_title = None
        bill_title_text = self.get_node(
            doc, '//h2[text()[contains(.,'
            '"Description")]]/following-sibling::p/text()')
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            long_desc_url = self.get_node(
                doc, '//a[text()[contains(.,'
                '"Long Description")]]/@href')
            # Only follow the link when the page actually has one.
            if long_desc_url is not None:
                long_desc_page = self.lxmlize(long_desc_url)
                long_desc_text = self.get_node(
                    long_desc_page, '//h1/'
                    'following-sibling::p/text()')
                if long_desc_text is not None:
                    bill_title = long_desc_text.strip()
        if bill_title is None:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)

        # The second character of the bill id selects the bill type.
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        # Add subjects.  Currently we are not mapping to Open States
        # standardized subjects, so use 'scraped_subjects'
        bill['scraped_subjects'] = self._subject_mapping[bill_id]

        # Get companion bill. The chamber is only looked up once a companion
        # id exists, so ``chamber_from_bill`` is never handed ``None``.
        companion_ids = doc.xpath(
            '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
        )
        if companion_ids:
            companion = self.make_bill_id(companion_ids[0])
            if companion is not None:
                bill.add_companion(
                    companion, chamber=self.chamber_from_bill(companion))

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        self.save_bill(bill)
Exemple #14
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on bills from a number of DBF files.

        Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ tables
        and the zipped roll-call vote files, attaches everything to ``Bill``
        objects keyed by bill id, then saves every bill.

        Bills with a blank synopsis are never created, so records in the
        other tables that point at an unknown bill are logged with
        ``self.warning`` and skipped instead of raising ``KeyError``.

        :param session: session identifier (stringified for ``Bill``).
        :param year_abr: year value passed to ``get_dbf`` and used to name
            the vote files; the following year is derived as ``+ 1``.
        """

        # Main bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_id = bill_type + str(int(rec["billnumber"]))
            title = rec["synopsis"]
            chamber = "lower" if bill_type[0] == 'A' else "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session),
                        chamber,
                        bill_id,
                        title,
                        type=self._bill_types[bill_type[1:]])
            if rec['identicalb']:
                bill.add_companion(rec['identicalb'].split()[0])
            # TODO: last session info is in there too
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(
            year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            sponsor_type = "primary" if rec["type"] == 'P' else "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]

            # keep only the last directory and the file name of the path
            document = rec["document"].split('\\')
            document = document[-2] + "/" + document[-1]
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
                year_abr, document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                # NOTE(review): a URL ending in neither 'HTM' nor 'wpd'
                # leaves ``mimetype`` unbound on the first record, or stale
                # from an earlier record afterwards. Decide how such
                # documents should be typed.
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue

            if filename.startswith(('A', 'CA')):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            votes = {}
            zipedfile = None
            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                vote_file = zipedfile.open("%s.txt" % filename, 'U')

                for rec in csv.DictReader(vote_file):
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'],
                                            rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber,
                                              date,
                                              action,
                                              None,
                                              None,
                                              None,
                                              None,
                                              bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)
            finally:
                # close the archive and remove the temp file even when a
                # record fails to parse
                if zipedfile is not None:
                    zipedfile.close()
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            date = rec["dateaction"]
            actor = actor_map[rec["house"]]
            comment = rec["comment"]
            action, atype = self.categorize_action(rec["action"], bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Exemple #15
0
class AssemblyBillPage(object):
    '''Build a bill from the New York Assembly website.

    Collects the actions, sponsors, sponsor's memo, summary, companion
    bills, assembly floor votes and a version link for one bill.  All of
    the work is triggered by the constructor; afterwards ``self.bill``
    holds the result and ``self.succeeded`` tells whether the page had
    any ``<pre>`` text to scrape.
    '''

    metadata = metadata('ny')

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        '''Store the scrape context, create the ``Bill`` and build it.

        ``bill_id_parts`` is unpacked into letter, number and version.
        ``doc`` is the already-parsed bill page; it only needs to
        support ``xpath`` (see ``_build``).
        '''
        self.scraper = scraper
        self.session = session
        self.term = term_for_session('ny', session)
        for data in self.metadata['terms']:
            # Assigned on every pass so that, when no term lists this
            # session, the last term's start year is still used (the
            # historical fallback).
            self.term_start_year = data['start_year']
            if session in data['sessions']:
                self.termdata = data
                # Stop here so the start year stays the matching term's
                # instead of being overwritten by later terms.
                break
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text fetched by _get_chunks().
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

    def _build(self):
        '''Run every scraping step; do nothing if the page has no text.'''
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        '''Return ``(summary, actions)`` text, fetching the page once.

        The text of the first two ``<pre>`` elements of the
        summary/actions page is cached in ``self.data``.
        '''
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y&term=%s')
            url = url % (self.bill_id, self.term_start_year)
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre')[:2]
            self.data['summary'] = summary.text_content()
            self.data['actions'] = actions.text_content()
        return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        '''Record ``url`` as a bill source, then fetch it via the scraper.'''
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        '''Add the printable bill text page as an HTML version.'''
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s'
        url = url % (self.bill_id, self.term_start_year)
        self.bill.add_version(self.bill_id, url, mimetype='text/html')

    def get_companions(self):
        '''Add every bill named in the summary's "SAME AS" chunk.'''
        summary, _ = self._get_chunks()
        for chunk in summary.split('\n\n'):
            if not chunk.startswith('SAME AS'):
                continue
            companions = chunk.replace('SAME AS    ', '')
            if companions == 'No same as':
                continue
            for companion in re.split(r'\s*[\,\\]\s*', companions):
                # Strip the "Same as "/"Uni" prefixes and a trailing
                # "-<suffix>" so only the bill id remains.
                companion = re.sub(r'^Same as ', '', companion)
                companion = re.sub(r'^Uni', '', companion)
                companion = re.sub(r'\-\w+$', '', companion)
                self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        '''Attach the sponsor's memorandum link (lower chamber only).'''
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=%s&Memo=Y')
            url = url % (self.bill_id, self.term_start_year)
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        '''Use the last blank-line-separated chunk as the bill summary.'''
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        '''Strip titles and annotations from a legislator name.'''
        junk = [
            r'^Rules\s+',
            r'\(2nd Vice Chairperson\)',
            r'\(MS\)',
            'Assemblyman',
            'Assemblywoman',
            'Senator']
        for rgx in junk:
            # The flag must be passed by keyword: the fourth positional
            # argument of re.sub is ``count``, not ``flags``.
            name = re.sub(rgx, '', name, flags=re.I)

        # Collapse whitespace.
        name = re.sub(r'\s+', ' ', name)
        return name.strip('(), ')

    def get_sponsors(self):
        '''Add sponsors from the SPONSOR/COSPNSR/MLTSPNSR summary chunks.'''
        summary, _ = self._get_chunks()
        # Only "SPONSOR" maps to primary; everything else is a cosponsor.
        spons_swap = {'SPONSOR': 'primary'}
        for chunk in summary.split('\n\n'):
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if not chunk.startswith(sponsor_type):
                    continue
                # Everything after the first space is the name list; a
                # chunk holding only the label yields no names.
                data = chunk.partition(' ')[2]
                for sponsor in re.split(r',\s+', data.strip()):

                    if not sponsor:
                        continue

                    # If it's a "Rules" bill, add the Rules committee
                    # as the primary.
                    if sponsor.startswith('Rules'):
                        self.bill.add_sponsor('primary', 'Rules Committee',
                                              chamber='lower')

                    sponsor = self._scrub_name(sponsor)

                    _sponsor_type = spons_swap.get(sponsor_type, 'cosponsor')
                    self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                          official_type=sponsor_type)

    def get_actions(self):
        '''Parse dated action lines and add them to the bill.

        An all-uppercase action is attributed to the upper chamber,
        anything else to the lower chamber.
        '''
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        for date, action in re.findall(actions_rgx, actions):
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            act_chamber = 'upper' if action.isupper() else 'lower'
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types,
                                 **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        '''Scrape the assembly floor votes page, one vote per table.'''
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=%s&Votes=Y')
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return

        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        pre = doc.xpath('//pre')
        if pre and pre[0].text_content() == no_votes:
            return

        for table in doc.xpath('//table'):

            # The value sits in the element right after each label.
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Raw values of the non yes/no votes, kept per vote so that
            # one table's entries do not leak into the next vote.
            actual_vote = collections.defaultdict(list)

            # Cells alternate name, vote value; consume them pairwise.
            cells = iter(table.xpath('tr/td/text()'))
            while True:
                pair = list(islice(cells, 2))
                if len(pair) != 2:
                    # End of data. Stop.
                    break
                name, vote_val = pair
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Exemple #16
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on each bill from a number of DBF files.

        Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ
        tables plus the zipped vote dumps, merges everything into one
        ``Bill`` per bill id and saves every bill that has at least one
        action or version.  Rows that refer to a bill id missing from
        MAINBILL (for example a bill skipped for its blank title) are
        reported with a warning and ignored.
        """

        # Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, "MAINBILL")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            # Bill types starting with "A" belong to the lower chamber.
            if bill_type[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec["identicalb"]:
                bill.add_companion(rec["identicalb"].split()[0])
            # TODO: last session info is in there too
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, "BILLSPON")

        for rec in bill_sponsors_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            # "P" marks the primary sponsor; anything else is a cosponsor.
            if rec["type"] == "P":
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, "BILLWP")

        for rec in bill_document_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            # Keep only the last directory and the file name of the
            # backslash-separated path.
            document = rec["document"].split("\\")
            document = document[-2] + "/" + document[-1]

            htm_url = "http://www.njleg.state.nj.us/%s/Bills/%s" % (
                year_abr, document.replace(".DOC", ".HTM"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["doctype"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" %
                                (rec["doctype"], bill_id))
            if rec["comment"]:
                doc_name += " " + rec["comment"]

            if rec["doctype"] in self._version_types:
                if htm_url.endswith("HTM"):
                    mimetype = "text/html"
                elif htm_url.endswith("wpd"):
                    mimetype = "application/vnd.wordperfect"
                else:
                    # Previously this fell through with ``mimetype``
                    # unbound or left over from an earlier record.
                    self.warning("unknown mimetype for %s on %s" %
                                 (htm_url, bill_id))
                    continue
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning("could not find %s" % s_vote_url)
                continue
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % filename
            vote_file = zipedfile.open(vfile, "U")
            vdict_file = csv.DictReader(vote_file)

            # One Vote per (bill, chamber, motion), filled row by row.
            votes = {}
            if filename.startswith(("A", "CA")):
                chamber = "lower"
            else:
                chamber = "upper"

            # File names starting with "C" hold committee votes.
            if filename.startswith("C"):
                vote_file_type = "committee"
            else:
                vote_file_type = "chamber"

            for rec in vdict_file:

                if vote_file_type == "chamber":
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                    leg = rec["Name"]
                    # drop time portion
                    date = rec["Agenda_Date"].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec["BillAction"]]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec["LegislatorVote"][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = "_".join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None, None,
                                          None, None, bill_id=bill_id)
                if vote_file_type == "committee":
                    votes[vote_id]["committee"] = self._committees[
                        rec["Committee_House"]]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # release the archive, then remove the temp file
            zipedfile.close()
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count
                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning("unknown bill %s in vote database" %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, "BILLHIST")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            date = rec["dateaction"]
            actor = actor_map[rec["house"]]
            comment = rec["comment"]
            action, atype = self.categorize_action(rec["action"], bill_id)
            if comment:
                action += " " + comment
            bill_dict[bill_id].add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, "BILLSUBJ")
        for rec in subject_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault("subjects", []).append(rec["subjectkey"])
            else:
                self.warning("invalid bill id in BILLSUBJ.DBF: %s" % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            # A bill with neither actions nor versions is not saved.
            if not bill["actions"] and not bill["versions"]:
                self.warning("probable phony bill detected %s", bill["bill_id"])
                phony_bill_count += 1
            else:
                self.save_bill(bill)

        if phony_bill_count:
            self.warning("%s total phony bills detected", phony_bill_count)
Exemple #17
0
    def scrape_bills(self, session, year_abr):
        """Assemble and save every bill from the exported database tables.

        Reads the MainBill, BillSpon, BillWP, BillHist and BillSubj
        tables through ``access_to_csv`` plus the zipped vote dumps,
        merges everything into one ``Bill`` per bill id and saves every
        bill that has at least one action or version.  Rows that refer
        to a bill id missing from MainBill are reported with a warning
        and ignored.
        """
        #Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            # Bill types starting with "A" belong to the lower chamber.
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            if rec['IdenticalBillNumber'].strip():
                bill.add_companion(rec['IdenticalBillNumber'].split()[0])

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            # 'P' marks the primary sponsor; anything else is a cosponsor.
            if rec["Type"] == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill_dict[bill_id].add_sponsor(sponsor_type, rec["Sponsor"])

        #Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            # Keep only the last directory and the file name of the
            # backslash-separated path.
            document = rec["Document"].split('\\')
            document = document[-2] + "/" + document[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if rec['DocType'] in self._version_types:
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                else:
                    # Previously this fell through with ``mimetype``
                    # unbound or left over from an earlier record.
                    self.warning('unknown mimetype for %s on %s' %
                                 (htm_url, bill_id))
                    continue
                try:
                    bill.add_version(doc_name, htm_url, mimetype=mimetype)
                except ValueError:
                    self.warning("Couldn't find a document for bill {}".format(bill_id))
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr)+1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                         ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zipedfile = zipfile.ZipFile(s_vote_zip)

            # Chamber and vote type depend only on the archive name.
            if filename.startswith(('A', 'CA')):
                chamber = "lower"
            else:
                chamber = "upper"

            # File names starting with 'C' hold committee votes.
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            # Collected once per archive: resetting this for every member
            # file dropped the main file's votes whenever an "End" file
            # existed, and left stale votes from the previous archive
            # when neither member file could be opened.
            # NOTE(review): assumes the "End" file does not repeat rows
            # of the main file -- confirm against a real dump.
            votes = {}

            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = zipedfile.open(vfile, 'U')
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                for rec in vdict_file:

                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join((bill_id, chamber, action))
                    vote_id = vote_id.replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = Vote(chamber, date, action, None, None,
                                              None, None, bill_id=bill_id)
                    if vote_file_type == 'committee':
                        votes[vote_id]['committee'] = self._committees[
                            rec['Committee_House']]

                    if leg_vote == "Y":
                        votes[vote_id].yes(leg)
                    elif leg_vote == "N":
                        votes[vote_id].no(leg)
                    else:
                        votes[vote_id].other(leg)

            # release the archive, then remove the temp file
            zipedfile.close()
            os.remove(s_vote_zip)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])

                # Veto override.
                if vote['motion'] == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    vote['passed'] = False
                    if vote['chamber'] == 'lower':
                        if vote_yes_count >= 54:
                            vote['passed'] = True
                    elif vote['chamber'] == 'upper':
                        if vote_yes_count >= 27:
                            vote['passed'] = True

                # Regular vote.
                else:
                    vote["passed"] = vote_yes_count > vote_no_count

                vote_bill_id = vote["bill_id"]
                if vote_bill_id not in bill_dict:
                    self.warning('unknown bill %s in vote database' %
                                 vote_bill_id)
                    continue
                bill_dict[vote_bill_id].add_vote(vote)

        #Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            date = datetime.strptime(rec["DateAction"], "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(rec["Action"], bill_id)
            if comment:
                action += (' ' + comment)
            bill_dict[bill_id].add_action(actor, action, date, type=atype)

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            bill.setdefault('subjects', []).append(rec['SubjectKey'])

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.itervalues():
            # A bill with neither actions nor versions is not saved.
            if not bill['actions'] and not bill['versions']:
                self.warning('probable phony bill detected %s',
                             bill['bill_id'])
                phony_bill_count += 1
            else:
                # add sources
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                self.save_bill(bill)

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
Exemple #18
0
class AssemblyBillPage(object):
    """Get the actions, sponsors, sponsors memo and summary
    and assembly floor votes from the assembly page.
    """

    metadata = metadata("ny")

    def __init__(self, scraper, session, chamber, url, doc, bill_type, bill_id, title, bill_id_parts):
        """Store the page context, create the Bill and populate it.

        scraper: object used for fetching (``url2lxml``) and action
            categorization (``categorizer``) by the other methods.
        session: legislative session; used to look up the term.
        chamber: chamber passed to the Bill.
        url: page URL, recorded as a bill source when the build succeeds.
        doc: object queried with ``.xpath()`` in ``_build`` (presumably the
            parsed page at ``url`` -- confirm against the caller).
        bill_type, bill_id, title: passed through to the Bill.
        bill_id_parts: 3-item sequence unpacked into letter, number, version.

        After construction ``self.succeeded`` tells whether ``_build`` ran
        to completion.
        """
        self.scraper = scraper
        self.session = session
        self.term = term_for_session("ny", session)
        for data in self.metadata["terms"]:
            # Record the start year of each term as we walk, and stop at the
            # term that contains this session so the value is that term's
            # start year. If no term lists the session, the value stays at
            # the last term's start year (the previous behaviour).
            self.term_start_year = data["start_year"]
            if session in data["sessions"]:
                self.termdata = data
                break
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text fetched by _get_chunks().
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()

    def _build(self):
        """Run every extraction step in order and mark the page as succeeded.

        Nothing is done (and ``succeeded`` stays unchanged) when the
        document contains no text inside a ``<pre>`` element.
        """
        if self.doc.xpath("//pre/text()"):
            steps = (
                self.get_actions,
                self.get_sponsors_memo,
                self.get_sponsors,
                self.get_summary,
                self.get_companions,
                self.get_lower_votes,
                self.get_version,
            )
            for step in steps:
                step()
            self.succeeded = True
            self.bill.add_source(self.url)

    def _get_chunks(self):
        """Return the ``(summary, actions)`` text for this bill.

        On the first call the summary/actions page is fetched, the text of
        its first two ``<pre>`` elements is extracted and stored in
        ``self.data``; later calls return the stored values.
        """
        cache = self.data
        if "summary" in cache:
            return cache["summary"], cache["actions"]

        template = "http://assembly.state.ny.us/leg/?default_fld=&" "bn=%s&Summary=Y&Actions=Y&term=%s"
        page = self.url2lxml(template % (self.bill_id, self.term_start_year))
        summary_node, actions_node = page.xpath("//pre")[:2]
        summary_text = summary_node.text_content()
        actions_text = actions_node.text_content()
        cache["summary"] = summary_text
        cache["actions"] = actions_text
        return summary_text, actions_text

    def url2lxml(self, url):
        """Register ``url`` as a source of the bill, then fetch it through the scraper."""
        self.bill.add_source(url)
        parsed = self.scraper.url2lxml(url)
        return parsed

    def get_version(self):
        """Add the print-view URL of the bill as an HTML version named after the bill id."""
        link = "http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s" % (
            self.bill_id,
            self.term_start_year,
        )
        self.bill.add_version(self.bill_id, link, mimetype="text/html")

    def get_companions(self):
        """Add companion bills listed in the "SAME AS" paragraph of the summary.

        The paragraph is skipped when it reads "No same as". Each listed
        entry has a leading "Same as " / "Uni" and a trailing "-<word>"
        suffix removed before being added.
        """
        summary_text, _ = self._get_chunks()
        cleanup_patterns = (r"^Same as ", r"^Uni", r"\-\w+$")
        for paragraph in summary_text.split("\n\n"):
            if not paragraph.startswith("SAME AS"):
                continue
            listing = paragraph.replace("SAME AS    ", "")
            if listing == "No same as":
                continue
            for entry in re.split(r"\s*[\,\\]\s*", listing):
                for pattern in cleanup_patterns:
                    entry = re.sub(pattern, "", entry)
                self.bill.add_companion(entry)

    def get_sponsors_memo(self):
        """Attach the sponsor's memorandum link as a document (lower chamber only)."""
        if self.chamber != "lower":
            return
        template = "http://assembly.state.ny.us/leg/?" "default_fld=&bn=%s&term=%s&Memo=Y"
        memo_url = template % (self.bill_id, self.term_start_year)
        self.bill.add_document("Sponsor's Memorandum", memo_url)

    def get_summary(self):
        """Store the last blank-line-separated paragraph of the summary text as the bill summary."""
        summary_text, _ = self._get_chunks()
        paragraphs = summary_text.split("\n\n")
        last_paragraph = paragraphs[-1]
        self.bill["summary"] = last_paragraph

    def _scrub_name(self, name):
        """Return ``name`` with titles and role markers removed.

        Removes (case-insensitively) a leading "Rules ", the markers
        "(2nd Vice Chairperson)" and "(MS)", and the titles "Assemblyman",
        "Assemblywoman" and "Senator"; then collapses runs of whitespace to
        a single space and strips parentheses, commas and spaces from both
        ends.
        """
        junk = [
            r"^Rules\s+",
            r"\(2nd Vice Chairperson\)",
            r"\(MS\)",
            "Assemblyman",
            "Assemblywoman",
            "Senator",
        ]
        for rgx in junk:
            # flags must be given by keyword: the fourth positional argument
            # of re.sub is ``count``, so passing re.I there silently limited
            # the number of substitutions instead of ignoring case.
            name = re.sub(rgx, "", name, flags=re.I)

        # Collapse whitespace.
        name = re.sub(r"\s+", " ", name)
        return name.strip("(), ")

    def get_sponsors(self):
        """Add sponsors found in the SPONSOR / COSPNSR / MLTSPNSR paragraphs.

        "SPONSOR" entries are added as "primary", the other two kinds as
        "cosponsor"; the paragraph label is kept as ``official_type``. An
        entry starting with "Rules" additionally adds the Rules Committee
        as a primary sponsor for the lower chamber.
        """
        labels = ("SPONSOR", "COSPNSR", "MLTSPNSR")
        role_by_label = {"SPONSOR": "primary"}

        summary_text, _ = self._get_chunks()
        for paragraph in summary_text.split("\n\n"):
            for label in labels:
                if not paragraph.startswith(label):
                    continue

                # Everything after the first space is the comma-separated list.
                _, listing = paragraph.split(" ", 1)
                role = role_by_label.get(label, "cosponsor")

                for raw_name in re.split(r",\s+", listing.strip()):
                    if not raw_name:
                        continue

                    # A "Rules" bill gets the Rules committee as a primary
                    # sponsor in addition to the scrubbed entry below.
                    if raw_name.startswith("Rules"):
                        self.bill.add_sponsor("primary", "Rules Committee", chamber="lower")

                    cleaned = self._scrub_name(raw_name)
                    self.bill.add_sponsor(role, cleaned.strip(), official_type=label)

    def get_actions(self):
        """Add one bill action per dated line of the actions text.

        Lines are matched as "MM/DD/YYYY <description>". A description that
        is entirely upper case is attributed to the "upper" chamber,
        anything else to "lower". Processing stops after the first action
        whose description contains "substituted by".
        """
        _, actions_text = self._get_chunks()
        categorize = self.scraper.categorizer.categorize
        for match in re.finditer(r"(\d{2}/\d{2}/\d{4})\s+(.+)", actions_text):
            date_text, description = match.groups()
            when = datetime.datetime.strptime(date_text, r"%m/%d/%Y")
            if description.isupper():
                origin = "upper"
            else:
                origin = "lower"
            types, attrs = categorize(description)
            self.bill.add_action(origin, description, when, type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if "substituted by" in description:
                break

    def get_lower_votes(self):
        """Fetch the bill's votes page and add one floor Vote per ``<table>``.

        Returns early when the page could not be fetched (``None``) or when
        its first ``<pre>`` holds the "no votes" notice. For each table the
        date and the YEA/NAY totals are read from the element following the
        matching caption label, and the table cells are consumed as
        (name, vote value) pairs. Values other than "Y", "N" and "NO" are
        counted as "other" and additionally grouped by their raw value in
        the vote's ``actual_vote`` mapping.
        """
        url = "http://assembly.state.ny.us/leg/?" "default_fld=&bn=%s&term=%s&Votes=Y"
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return

        pre = doc.xpath("//pre")[0].text_content()
        no_votes = "There are no votes for this bill in this " "legislative session."
        if pre == no_votes:
            return

        for table in doc.xpath("//table"):
            # One mapping per table: a single shared mapping would leak the
            # names from later tables into the votes built from earlier ones.
            actual_vote = collections.defaultdict(list)

            date = table.xpath('caption/label[contains(., "DATE:")]')
            # The value lives in the first sibling after the label.
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, "%m/%d/%Y")

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split("/"))

            passed = yes_count > no_count
            vote = Vote("lower", date, "Floor Vote", passed, yes_count, no_count, other_count=0)

            # Cells alternate name, vote value. Zipping one iterator with
            # itself yields consecutive pairs and drops a trailing unpaired
            # cell.
            cells = iter(table.xpath("tr/td/text()"))
            for name, vote_val in zip(cells, cells):
                name = self._scrub_name(name)
                cast = vote_val.strip()

                if cast == "Y":
                    vote.yes(name)
                elif cast in ("N", "NO"):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote["other_count"] = len(vote["other_votes"])
            vote["actual_vote"] = actual_vote
            self.bill.add_vote(vote)