Example #1
def scan_votes(abbr):
    sessions = defaultdict(_vote_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for vote in db.votes.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[vote['session']]

        session_d['vote_count'] += 1
        if vote['passed']:
            session_d['_passed_vote_count'] += 1
        session_d['votes_per_chamber'][vote['chamber']] += 1
        if not vote.get('type'):
            logger.warning('vote %s missing type' % vote['_id'])
            continue
        session_d['votes_per_type'][vote.get('type')] += 1
        if not vote.get('date'):
            logger.warning('vote %s missing date' % vote['_id'])
            continue
        session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

        # roll calls
        has_rollcalls = False
        for rc in (vote['yes_votes'] + vote['no_votes'] +
                   vote['other_votes']):
            has_rollcalls = True
            session_d['_rollcall_count'] += 1
            if rc.get('leg_id'):
                session_d['_rollcalls_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_voters'].add(
                    (term_for_session(abbr, vote['session']),
                     vote['chamber'],
                     rc['name'])
                )

        # check counts if any rollcalls are present
        if has_rollcalls:
            if (len(vote['yes_votes']) != vote['yes_count'] and
                vote['vote_id'] not in
                quality_exceptions['votes:bad_yes_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])
            if (len(vote['no_votes']) != vote['no_count'] and
                vote['vote_id'] not in
                quality_exceptions['votes:bad_no_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])
            if (len(vote['other_votes']) != vote['other_count'] and
                vote['vote_id'] not in
                quality_exceptions['votes:bad_other_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.iteritems():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} votes: \n  {2}'
                           .format(qe_type, len(qes), '\n  '.join(qes)))

    return {'sessions': sessions}
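
Every snippet on this page exercises term_for_session. For orientation, here is a minimal sketch of the lookup it presumably performs, inferred only from how metadata['terms'] is walked in the scrape() examples below; the two-argument form used in most snippets presumably fetches the metadata from the database first, which is left out to keep the sketch self-contained. The demo metadata is hypothetical.

def term_for_session(abbr, session, metadata):
    # Assumed behavior: return the name of the term whose session list
    # contains the given session. metadata['terms'] is taken to be a
    # list of dicts with 'name' and 'sessions' keys, matching the
    # scrape() examples below.
    for term in metadata['terms']:
        if session in term['sessions']:
            return term['name']
    raise ValueError('no term found for session %r (%s)' % (session, abbr))

# Hypothetical metadata, for illustration only:
metadata = {'terms': [{'name': '2011-2012', 'sessions': ['2011', '2012']}]}
print(term_for_session('ny', '2012', metadata))  # -> '2011-2012'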
Example #2
def pagerank(state, session, chamber, d_factor=0.85):
    term = utils.term_for_session(state, session)

    leg_indexes = generate_leg_indexes(state, term, chamber)
    M = generate_adjacency_matrix(state, session, chamber,
                                  leg_indexes)
    size = len(leg_indexes)

    # Scale each column of the adjacency matrix
    for i in xrange(0, size):
        col_sum = M[:,i].sum()
        if col_sum:
            M[:,i] = M[:,i] / col_sum

    e = ((1.0 - d_factor) / size) * numpy.ones((size, size))

    result = numpy.ones(size) / size

    for i in xrange(0, 100):
        result = numpy.dot(d_factor * M + e, result)
        result /= result.sum()

    for leg_id in leg_indexes.keys():
        leg_indexes[leg_id] = result[leg_indexes[leg_id]]

    return leg_indexes
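
As a quick sanity check of the power iteration above, the same update can be run on a hand-built 3-node link matrix (column i holding node i's scaled out-links, which is what generate_adjacency_matrix plus the column scaling is assumed to produce):

import numpy

# Column i = out-links of node i: 0 -> {1, 2}, 1 -> {2}, 2 -> {0}.
# Columns already sum to 1, so the scaling step is a no-op here.
M = numpy.array([[0.0, 0.0, 1.0],
                 [0.5, 0.0, 0.0],
                 [0.5, 1.0, 0.0]])
d_factor = 0.85
size = 3
e = ((1.0 - d_factor) / size) * numpy.ones((size, size))
result = numpy.ones(size) / size
for _ in range(100):
    result = numpy.dot(d_factor * M + e, result)
    result /= result.sum()
print(result)  # node 2 collects the most link weight and ranks highest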
Example #3
    def __init__(self, scraper, session, chamber, details):
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         bill_id_parts) = details

        self.scraper = scraper
        self.session = session
        self.chamber = chamber
        self.data = {}
        self.bill = Bill(session, bill_chamber, bill_id, title, type=bill_type)

        self.term = term_for_session('ny', session)
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                self.term_start_year = data['start_year']

        self.assembly_url = assembly_url
        self.senate_url = senate_url
        self.bill_chamber = bill_chamber
        self.bill_type = bill_type
        self.bill_id = bill_id
        self.title = title
        self.letter, self.number, self.version = bill_id_parts

        self.urls = Urls(scraper=self.scraper,
                         urls={
                             'assembly': assembly_url,
                             'senate': senate_url
                         })
Example #4
    def __init__(self, scraper, session, chamber, details):
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, bill_id_parts) = details

        self.scraper = scraper
        self.session = session
        self.chamber = chamber
        self.data = {}
        self.bill = Bill(session, bill_chamber, bill_id, title, type=bill_type)

        self.term = term_for_session('ny', session)
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                self.term_start_year = data['start_year']

        self.assembly_url = assembly_url
        self.senate_url = senate_url
        self.bill_chamber = bill_chamber
        self.bill_type = bill_type
        self.bill_id = bill_id
        self.title = title
        self.letter, self.number, self.version = bill_id_parts

        self.urls = Urls(scraper=self.scraper, urls={
            'assembly': assembly_url,
            'senate': senate_url})
Example #5
 def scrape(self, session, chambers):
     term_id = term_for_session('ny', session)
     for term in self.metadata['terms']:
         if term['name'] == term_id:
             break
     self.term = term
     for billset in self.yield_grouped_versions():
         self.scrape_bill(session, billset)
Example #6
 def scrape(self, session, chambers):
     term_id = term_for_session("ny", session)
     for term in self.metadata["terms"]:
         if term["name"] == term_id:
             break
     self.term = term
     for billset in self.yield_grouped_versions():
         self.scrape_bill(session, billset)
Example #7
def scan_votes(abbr):
    sessions = defaultdict(_vote_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for vote in db.votes.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[vote['session']]

        session_d['vote_count'] += 1
        if vote['passed']:
            session_d['_passed_vote_count'] += 1
        session_d['votes_per_chamber'][vote['chamber']] += 1
        if not vote.get('type'):
            logger.warning('vote %s missing type' % vote['_id'])
            continue
        session_d['votes_per_type'][vote.get('type')] += 1
        if not vote.get('date'):
            logger.warning('vote %s missing date' % vote['_id'])
            continue
        session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

        # roll calls
        has_rollcalls = False
        for rc in (vote['yes_votes'] + vote['no_votes'] + vote['other_votes']):
            has_rollcalls = True
            session_d['_rollcall_count'] += 1
            if rc.get('leg_id'):
                session_d['_rollcalls_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_voters'].add(
                    (term_for_session(abbr, vote['session']), vote['chamber'],
                     rc['name']))

        # check counts if any rollcalls are present
        if has_rollcalls:
            if (len(vote['yes_votes']) != vote['yes_count'] and vote['vote_id']
                    not in quality_exceptions['votes:bad_yes_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])
            if (len(vote['no_votes']) != vote['no_count'] and vote['vote_id']
                    not in quality_exceptions['votes:bad_no_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])
            if (len(vote['other_votes']) != vote['other_count']
                    and vote['vote_id']
                    not in quality_exceptions['votes:bad_other_count']):
                session_d['bad_vote_counts'].add(vote['vote_id'])

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.iteritems():
        if qes:
            logger.warning(
                'unnecessary {0} exceptions for {1} votes: \n  {2}'.format(
                    qe_type, len(qes), '\n  '.join(qes)))

    return {'sessions': sessions}
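
Both scan_votes variants (Examples #1 and #7) lean on get_quality_exceptions. A plausible sketch of what it returns, based purely on how it is consumed here; db and settings are the same module-level objects the examples use, and the quality_exceptions collection name and its 'type'/'ids' fields are assumptions, not billy's confirmed schema.

from collections import defaultdict

def get_quality_exceptions(abbr):
    # Assumed shape: map an exception type such as 'votes:bad_yes_count'
    # to the set of ids excused from that check.
    quality_exceptions = defaultdict(set)
    for qe in db.quality_exceptions.find({settings.LEVEL_FIELD: abbr}):
        quality_exceptions[qe['type']].update(qe.get('ids', []))
    return quality_exceptions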
Example #8
def legislator_pagerank(abbr, session, chamber, d_factor=0.85):
    term = utils.term_for_session(abbr, session)
    leg_indexes = generate_leg_indexes(abbr, term, chamber)
    adjacency_matrix = generate_adjacency_matrix(abbr, session, chamber,
                                                 leg_indexes)
    result = pagerank(adjacency_matrix, d_factor)

    for leg_id in leg_indexes.keys():
        leg_indexes[leg_id] = result[leg_indexes[leg_id]]

    return leg_indexes
Example #9
def legislator_pagerank(state, session, chamber, d_factor=0.85):
    term = utils.term_for_session(state, session)
    leg_indexes = generate_leg_indexes(state, term, chamber)
    adjacency_matrix = generate_adjacency_matrix(state, session,
                                                 chamber, leg_indexes)
    result = pagerank(adjacency_matrix, d_factor)

    for leg_id in leg_indexes.keys():
        leg_indexes[leg_id] = result[leg_indexes[leg_id]]

    return leg_indexes
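
Both variants above delegate the power iteration to a standalone pagerank helper. A sketch reconstructed from the inline version in Example #2 (the column scaling and fixed 100 iterations are carried over; the real helper may differ in such details):

import numpy

def pagerank(M, d_factor=0.85):
    # Scale each column of the adjacency matrix to sum to 1 (done
    # in place, as in Example #2), then iterate the rank vector.
    size = M.shape[0]
    for i in range(size):
        col_sum = M[:, i].sum()
        if col_sum:
            M[:, i] = M[:, i] / col_sum
    e = ((1.0 - d_factor) / size) * numpy.ones((size, size))
    result = numpy.ones(size) / size
    for _ in range(100):
        result = numpy.dot(d_factor * M + e, result)
        result /= result.sum()
    return result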
Example #10
    def scrape(self, session, chambers):
        self.api_client = OpenLegislationAPIClient(self)

        term_id = term_for_session('ny', session)

        for term in reversed(self.metadata['terms']):
            if term['name'] == term_id:
                self.term_start_year = term['start_year']
                break

        for bill in self._generate_bills(session):
            bill_object = self._scrape_bill(session, bill)
            self.save_bill(bill_object)
Example #11
    def __init__(self, scraper, session, bill, details):
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, bill_id_parts) = details

        self.bill = bill
        self.bill_id = bill_id
        # This works on the assumption that the metadata term ID is
        # only the start year.
        self.term_start_year = term_for_session('ny', session)
        self.letter, self.number, self.version = bill_id_parts
        self.shared_url = 'http://assembly.state.ny.us/leg/?default_fld='\
            '&bn={}&term={}'.format(self.bill_id, self.term_start_year)
        self.urls = Urls(scraper=scraper, urls={
            'assembly': assembly_url,
            'senate': senate_url})
Example #12
    def __init__(self, scraper, session, bill, details):
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         bill_id_parts) = details

        self.bill = bill
        self.bill_id = bill_id
        # This works on the assumption that the metadata term ID is
        # only the start year.
        self.term_start_year = term_for_session('ny', session)
        self.letter, self.number, self.version = bill_id_parts
        self.shared_url = 'http://assembly.state.ny.us/leg/?default_fld='\
            '&bn={}&term={}'.format(self.bill_id, self.term_start_year)
        self.urls = Urls(scraper=scraper,
                         urls={
                             'assembly': assembly_url,
                             'senate': senate_url
                         })
Example #13
    def __init__(self, scraper, session, chamber, url, doc, bill_type, bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.session = session
        self.term = term_for_session("ny", session)
        for data in self.metadata["terms"]:
            if session in data["sessions"]:
                self.termdata = data
            self.term_start_year = data["start_year"]
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()
Example #14
    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.session = session
        self.term = term_for_session('ny', session)
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                self.term_start_year = data['start_year']
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False

        self._build()
Example #15
def import_bill(data, votes):
    level = data['level']
    abbr = data[level]
    # clean up bill_id
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # move subjects to scraped_subjects
    subjects = data.pop('subjects', None)

    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    if subjects:
        data['scraped_subjects'] = subjects

    # add loaded votes to data
    bill_votes = votes.pop((data['chamber'], data['session'], data['bill_id']),
                           [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    vote_matcher = VoteMatcher(abbr)
    if bill:
        vote_matcher.learn_vote_ids(bill['votes'])
    vote_matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None, sponsor['name'])
        sponsor['leg_id'] = id

    for vote in data['votes']:

        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'], vote['chamber'],
                                       svote)
                svlist.append({'name': svote, 'leg_id': id})

            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if not bill:
        insert_with_id(data)
    else:
        update(bill, data, db.bills)
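
import_bill leans on fix_bill_id to normalize scraped bill ids before any matching. A hedged sketch of that normalization; the exact rules are an assumption, the point being that ids like 'HB001' and 'H.B. 1' should collapse to one canonical form so database lookups line up.

import re

def fix_bill_id(bill_id):
    # Assumed normalization: drop periods, strip leading zeros from the
    # number, and separate the alpha prefix with a single space.
    bill_id = bill_id.replace('.', '')
    return re.sub(r'([A-Z]*)\s*0*(\d+)', r'\1 \2', bill_id, 1).strip()

print(fix_bill_id('HB001'))   # -> 'HB 1'
print(fix_bill_id('H.B. 1'))  # -> 'HB 1'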
Example #16
def scan_bills(abbr):
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name']))
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            if not vote.get('type'):
                logger.warning('vote is missing type on %s' % bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s' % bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            has_rollcalls = False
            for rc in (vote['yes_votes'] + vote['no_votes'] +
                       vote['other_votes']):
                has_rollcalls = True
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'], rc['name']))

            # check counts if any rollcalls are present
            if (has_rollcalls
                    and (len(vote['yes_votes']) != vote['yes_count']
                         or len(vote['no_votes']) != vote['no_count']
                         or len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add a duplicate documents back in?

    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    return {
        'duplicate_versions': dup_version_urls,
        'duplicate_sources': dup_source_urls,
        'other_actions': other_actions.items(),
        'uncategorized_subjects': uncategorized_subjects.items(),
        'sessions': sessions,
    }
Example #17
def dump_missing_leg_ids(abbr, detailed=False):
    """
    For a given abbr, find all of the sponsorships, votes and committee
    memberships which are missing legislator IDs and output them to
    CSV files.
    """
    missing_csv = csv.writer(open('%s_missing_leg_ids.csv' % abbr, 'w'))
    missing_csv.writerow(('term', 'chamber', 'name'))
    missing = set()

    level = metadata(abbr)['level']

    if detailed:
        sponsor_csv = csv.writer(
            open('%s_missing_sponsor_leg_ids.csv' % abbr, 'w'))
        sponsor_csv.writerow(("Abbreviation", "Session", "Chamber", "Bill ID",
                              "Sponsor Type", "Legislator Name"))

        vote_csv = csv.writer(open("%s_missing_vote_leg_ids.csv" % abbr, 'w'))
        vote_csv.writerow(
            ("Abbreviation", "Session", "Chamber", "Bill ID", "Vote Index",
             "Vote Chamber", "Vote Motion", "Vote", "Name"))

    for bill in db.bills.find({'level': level, level: abbr}):
        for sponsor in bill['sponsors']:
            if not sponsor['leg_id']:
                missing.add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name'].encode('ascii', 'replace')))

                if detailed:
                    sponsor_csv.writerow(
                        (abbr, bill['session'], bill['chamber'],
                         bill['bill_id'], sponsor['type'],
                         sponsor['name'].encode('ascii', 'replace')))

        i = 0
        for vote in bill['votes']:
            for vtype in ('yes', 'no', 'other'):
                for v in vote["%s_votes" % vtype]:
                    if not v['leg_id']:
                        missing.add((term_for_session(abbr, bill['session']),
                                     vote['chamber'],
                                     v['name'].encode('ascii', 'replace')))

                        if detailed:
                            vote_csv.writerow(
                                (abbr, bill['session'], bill['chamber'],
                                 bill['bill_id'], i, vote['chamber'],
                                 vote['motion'], vtype,
                                 v['name'].encode('ascii', 'replace')))
            i += 1

    if detailed:
        comm_csv = csv.writer(
            open("%s_missing_committee_leg_ids.csv" % abbr, 'w'))
        comm_csv.writerow(("Abbreviation", "Chamber", "Committee",
                           "Subcommittee", "Role", "Name"))

    for committee in db.committees.find({'level': level, level: abbr}):
        for member in committee['members']:
            if not member['leg_id']:
                missing.add((committee.get('term', ''), committee['chamber'],
                             member['name'].encode('ascii', 'replace')))

                if detailed:
                    com = committee['committee'].encode('ascii', 'replace')
                    subcom = (committee['subcommittee']
                              or u'').encode('ascii', 'replace')
                    comm_csv.writerow(
                        (abbr, committee['chamber'], com, subcom,
                         member['role'],
                         member['name'].encode('ascii', 'replace')))

    for item in missing:
        missing_csv.writerow(item)
Example #18
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(type, value, tb):
                traceback.print_exception(type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
Example #19
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately
        categorizer - SubjectCategorizer (None - no categorization)
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Example #20
def main():
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape', 'flags that help select what data to scrape')
        scrape = parser.add_argument_group('scraper config',
                                           'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (eg. nc)')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)
        for arg in ('bills', 'legislators', 'committees',
                    'votes', 'events', 'speeches'):
            what.add_argument('--' + arg, action='append_const', dest='types',
                              const=arg)
        for arg in ('scrape', 'import', 'report', 'session-list'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)

        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")
        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        logging.getLogger('billy').info(plan)

        scrape_data = {}

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            try:
                schema_path = os.path.join(os.path.split(__file__)[0],
                                           '../schemas/metadata.json')
                schema = json.load(open(schema_path))

                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                logging.getLogger('billy').warning(
                    'metadata validation error: ' + str(e))

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            order = ('legislators', 'committees', 'votes', 'bills',
                     'events', 'speeches')
            _traceback = None
            try:
                for stype in order:
                    if stype in args.types:
                        run_record += _run_scraper(stype, args, metadata)
            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ""
                    for t in fb:
                        trace += t
                    record['exception'] = {
                        "type": ex.__class__.__name__,
                        "message": ex.message,
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        db.billy_runs.save(scrape_data, safe=True)
                    except Exception:
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        logging.getLogger('billy').critical('Error: %s', e)
        sys.exit(1)
Example #21
def import_bill(data, votes):
    level = data['level']
    abbr = data[level]
    # clean up bill_id
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # move subjects to scraped_subjects
    subjects = data.pop('subjects', None)

    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    if subjects:
        data['scraped_subjects'] = subjects

    # add loaded votes to data
    bill_votes = votes.pop((data['chamber'], data['session'],
                            data['bill_id']), [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    vote_matcher = VoteMatcher(abbr)
    if bill:
        vote_matcher.learn_vote_ids(bill['votes'])
    vote_matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id

    for vote in data['votes']:

        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})

            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if not bill:
        insert_with_id(data)
    else:
        update(bill, data, db.bills)
Example #22
def scan_bills(abbr):
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']

            if not date:
                continue

            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1

        # handle no_actions bills
        if not bill['actions']:
            if bill['_id'] not in quality_exceptions['bills:no_actions']:
                session_d['actionless_count'] += 1
            else:
                quality_exceptions['bills:no_actions'].remove(bill['_id'])

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id') or sponsor.get('committee_id'):
                session_d['_sponsors_with_id_count'] += 1
            else:
                # keep list of unmatched sponsors
                session_d['unmatched_sponsors'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                     sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1

        # handle no sponsors bills
        if not bill['sponsors']:
            if bill['_id'] not in quality_exceptions['bills:no_sponsors']:
                session_d['sponsorless_count'] += 1
            else:
                quality_exceptions['bills:no_sponsors'].remove(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            if bill['_id'] not in quality_exceptions['bills:no_versions']:
                session_d['versionless_count'] += 1
            else:
                quality_exceptions['bills:no_versions'].remove(bill['_id'])
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

        # Check for progress meter gaps.
        progress_meter_gaps = session_d['progress_meter_gaps']
        action_dates = bill['action_dates']
        bill_chamber = bill['chamber']
        other_chamber = dict(lower='upper', upper='lower')[bill_chamber]

        # Check for bills that were signed but didn't pass both chambers.
        if bill['type'] == 'bill':
            if action_dates['signed']:
                if not action_dates['passed_upper']:
                    progress_meter_gaps.add(bill['_id'])
                elif not action_dates['passed_lower']:
                    progress_meter_gaps.add(bill['_id'])

        else:
            # Check for nonbills that were signed but didn't pass their
            # house of origin.
            if action_dates['signed']:
                if not action_dates['passed_' + bill_chamber]:
                    progress_meter_gaps.add(bill['_id'])

        if action_dates['passed_' + other_chamber]:
            if not action_dates['passed_' + bill_chamber]:
                progress_meter_gaps.add(bill['_id'])

    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.items():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.items():
        if n > 1:
            dup_source_urls.append(url)

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.items():
        if qes:
            logger.warning('unnecessary {0} exceptions for {1} bills: \n  {2}'
                           .format(qe_type, len(qes), '\n  '.join(qes)))

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
            'progress_meter_gaps': []
           }
Exemple #25
0
def dump_missing_leg_ids(abbr, detailed=False):
    """
    For a given abbr, find all of the sponsorships, votes and committee
    memberships which are missing legislator IDs and output them to
    CSV files.
    """
    missing_csv = csv.writer(open('%s_missing_leg_ids.csv' % abbr, 'w'))
    missing_csv.writerow(('term', 'chamber', 'name'))
    missing = set()

    level = metadata(abbr)['level']

    if detailed:
        sponsor_csv = csv.writer(open('%s_missing_sponsor_leg_ids.csv' %
                                      abbr, 'w'))
        sponsor_csv.writerow(("Abbreviation", "Session", "Chamber",
                              "Bill ID", "Sponsor Type", "Legislator Name"))

        vote_csv = csv.writer(open("%s_missing_vote_leg_ids.csv" %
                                   abbr, 'w'))
        vote_csv.writerow(("Abbreviation", "Session", "Chamber", "Bill ID",
                           "Vote Index", "Vote Chamber", "Vote Motion",
                           "Vote", "Name"))

    for bill in db.bills.find({'level': level, level: abbr}):
        for sponsor in bill['sponsors']:
            if not sponsor['leg_id']:
                missing.add((term_for_session(abbr, bill['session']),
                             bill['chamber'],
                             sponsor['name'].encode('ascii', 'replace')))

                if detailed:
                    sponsor_csv.writerow((abbr, bill['session'],
                                          bill['chamber'], bill['bill_id'],
                                          sponsor['type'],
                                          sponsor['name'].encode('ascii',
                                                                 'replace')))

        for i, vote in enumerate(bill['votes']):
            for vtype in ('yes', 'no', 'other'):
                for v in vote["%s_votes" % vtype]:
                    if not v['leg_id']:
                        missing.add((term_for_session(abbr, bill['session']),
                                     vote['chamber'],
                                     v['name'].encode('ascii', 'replace')))

                        if detailed:
                            vote_csv.writerow((abbr, bill['session'],
                                               bill['chamber'],
                                               bill['bill_id'],
                                               i, vote['chamber'],
                                               vote['motion'], vtype,
                                               v['name'].encode('ascii',
                                                                'replace')))

    if detailed:
        comm_csv = csv.writer(open("%s_missing_committee_leg_ids.csv" %
                                   abbr, 'w'))
        comm_csv.writerow(("Abbreviation", "Chamber", "Committee",
                           "Subcommittee", "Role", "Name"))

    for committee in db.committees.find({'level': level, level: abbr}):
        for member in committee['members']:
            if not member['leg_id']:
                missing.add((committee.get('term', ''),
                             committee['chamber'],
                             member['name'].encode('ascii', 'replace')))

                if detailed:
                    com = committee['committee'].encode('ascii', 'replace')
                    subcom = (committee['subcommittee'] or u'').encode(
                        'ascii', 'replace')
                    comm_csv.writerow((abbr, committee['chamber'],
                                       com, subcom, member['role'],
                                       member['name'].encode('ascii',
                                                             'replace')))

    for item in missing:
        missing_csv.writerow(item)
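
A minimal usage sketch for the function above; 'nc' is an illustrative
abbreviation, and a configured MongoDB connection (db) plus term metadata
are assumed to be available:

# hedged sketch -- 'nc' is illustrative; db/metadata are assumed configured
dump_missing_leg_ids('nc', detailed=True)
# writes nc_missing_leg_ids.csv plus, because detailed=True, the
# nc_missing_sponsor_leg_ids.csv, nc_missing_vote_leg_ids.csv and
# nc_missing_committee_leg_ids.csv breakdowns
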
Exemple #26
0
def import_bill(data, votes, categorizer):
    level = data["level"]
    abbr = data[level]

    # clean up bill_ids
    data["bill_id"] = fix_bill_id(data["bill_id"])
    if "alternate_bill_ids" in data:
        data["alternate_bill_ids"] = [fix_bill_id(bid) for bid in data["alternate_bill_ids"]]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop("subjects", None)
    if subjects:
        data["scraped_subjects"] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # This is a hack added for Rhode Island, where we can't determine
    # the full bill_id. If this key is in the metadata we just use the
    # numeric portion. Not ideal, as it won't work in states where
    # HB/SBs overlap, but in RI they never do.
    if metadata(abbr).get("_partial_vote_bill_id"):
        # pull off numeric portion of bill_id
        numeric_bill_id = data["bill_id"].split()[1]
        bill_votes = votes.pop((data["chamber"], data["session"], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"], data["bill_id"]), [])

    data["votes"].extend(bill_votes)

    bill = db.bills.find_one(
        {
            "level": level,
            level: abbr,
            "session": data["session"],
            "chamber": data["chamber"],
            "bill_id": data["bill_id"],
        }
    )

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill["votes"])
        doc_matcher.learn_ids(bill["versions"] + bill["documents"])
    vote_matcher.set_ids(data["votes"])
    doc_matcher.set_ids(data["versions"] + data["documents"])

    # match sponsor leg_ids
    for sponsor in data["sponsors"]:
        id = get_legislator_id(abbr, data["session"], None, sponsor["name"])
        sponsor["leg_id"] = id

    for vote in data["votes"]:

        # committee_ids
        if "committee" in vote:
            committee_id = get_committee_id(level, abbr, vote["chamber"], vote["committee"])
            vote["committee_id"] = committee_id

        # vote leg_ids
        for vtype in ("yes_votes", "no_votes", "other_votes"):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data["session"], vote["chamber"], svote)
                svlist.append({"name": svote, "leg_id": id})

            vote[vtype] = svlist

    data["_term"] = term_for_session(abbr, data["session"])

    alt_titles = set(data.get("alternate_titles", []))

    for version in data["versions"]:
        # push versions to oyster
        if settings.ENABLE_OYSTER and "url" in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if "title" in version:
            alt_titles.add(version["title"])
        if "+short_title" in version:
            alt_titles.add(version["+short_title"])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data["title"])
    except KeyError:
        pass
    data["alternate_titles"] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"
Exemple #27
0
def main(old_scrape_compat=False):
    try:
        parser = argparse.ArgumentParser(
            description='update billy data',
            parents=[base_arg_parser],
        )

        what = parser.add_argument_group(
            'what to scrape',
            'flags that help select what data to scrape')
        scrape = parser.add_argument_group(
            'scraper config',
            'settings for the scraper')

        parser.add_argument('module', type=str, help='scraper module (e.g. nc)')
        parser.add_argument('--pdb', action='store_true', default=False,
                            help='invoke PDB when exception is raised')
        parser.add_argument('--ipdb', action='store_true', default=False,
                            help='invoke IPDB when exception is raised')
        parser.add_argument('--pudb', action='store_true', default=False,
                            help='invoke PUDB when exception is raised')
        what.add_argument('-s', '--session', action='append',
                          dest='sessions', default=[],
                          help='session(s) to scrape')
        what.add_argument('-t', '--term', action='append', dest='terms',
                          help='term(s) to scrape', default=[])

        for arg in ('upper', 'lower'):
            what.add_argument('--' + arg, action='append_const',
                              dest='chambers', const=arg)

        for arg in (
                'bills',
                'legislators',
                'committees',
                'votes',
                'events',
                'speeches'):
            what.add_argument(
                '--' + arg,
                action='append_const',
                dest='types',
                const=arg)

        for arg in ('scrape', 'import', 'report'):
            parser.add_argument('--' + arg, dest='actions',
                                action="append_const", const=arg,
                                help='only run %s step' % arg)
        # special modes for debugging
        scrape.add_argument('--nonstrict', action='store_false', dest='strict',
                            default=True, help="don't fail immediately when"
                            " encountering validation warning")

        scrape.add_argument('--fastmode', help="scrape in fast mode",
                            action="store_true", default=False)

        scrape.add_argument('--billid', help="scrape only a single bill",
                            action="store", default=False)

        # scrapelib overrides
        scrape.add_argument('-r', '--rpm', action='store', type=int,
                            dest='SCRAPELIB_RPM')
        scrape.add_argument('--timeout', action='store', type=int,
                            dest='SCRAPELIB_TIMEOUT')
        scrape.add_argument('--retries', type=int,
                            dest='SCRAPELIB_RETRY_ATTEMPTS')
        scrape.add_argument('--retry_wait', type=int,
                            dest='SCRAPELIB_RETRY_WAIT_SECONDS')

        args = parser.parse_args()

        if args.pdb or args.pudb or args.ipdb:
            _debugger = pdb
            if args.pudb:
                try:
                    import pudb
                    _debugger = pudb
                except ImportError:
                    pass
            if args.ipdb:
                try:
                    import ipdb
                    _debugger = ipdb
                except ImportError:
                    pass

            # turn on PDB-on-error mode
            # stolen from http://stackoverflow.com/questions/1237379/
            # if this causes problems in interactive mode check that page
            def _tb_info(_type, value, tb):
                traceback.print_exception(_type, value, tb)
                _debugger.pm()
            sys.excepthook = _tb_info

        # inject scraper paths so scraper module can be found
        for newpath in settings.SCRAPER_PATHS:
            sys.path.insert(0, newpath)

        # get metadata
        module = importlib.import_module(args.module)
        metadata = module.metadata
        module_settings = getattr(module, 'settings', {})
        abbrev = metadata['abbreviation']

        # load module settings, then command line settings
        settings.update(module_settings)
        settings.update(args)

        # make output dir
        args.output_dir = os.path.join(settings.BILLY_DATA_DIR, abbrev)

        # if terms aren't set, use latest
        if not args.terms:
            if args.sessions:
                for session in args.sessions:
                    args.terms.append(
                        term_for_session(metadata['abbreviation'], session,
                                         metadata))
                args.terms = list(set(args.terms or []))
            else:
                latest_term = metadata['terms'][-1]['name']
                args.terms = [latest_term]
        # only set sessions from terms if sessions weren't set
        elif not args.sessions:
            for term in metadata['terms']:
                if term['name'] in args.terms:
                    args.sessions.extend(term['sessions'])
            # dedup sessions
            args.sessions = list(set(args.sessions or []))

        if not args.sessions:
            args.sessions = [metadata['terms'][-1]['sessions'][-1]]

        # determine chambers
        if not args.chambers:
            args.chambers = ['upper', 'lower']

        if not args.actions:
            args.actions = ['scrape', 'import', 'report']

        if not args.types:
            args.types = ['bills', 'legislators', 'votes', 'committees',
                          'alldata']

            if 'events' in metadata['feature_flags']:
                args.types.append('events')

            if 'speeches' in metadata['feature_flags']:
                args.types.append('speeches')

        plan = """billy-update abbr=%s
    actions=%s
    types=%s
    sessions=%s
    terms=%s""" % (args.module, ','.join(args.actions), ','.join(args.types),
                   ','.join(args.sessions), ','.join(args.terms))
        _log.info(plan)
        scrape_data = {}

        if args.billid is False:
            _log.debug("No billid filter.")
        else:
            _log.debug("Search for billid: %s" % args.billid)

        if 'scrape' in args.actions:
            _clear_scraped_data(args.output_dir)

            # validate then write metadata
            if hasattr(module, 'session_list'):
                session_list = module.session_list()
            else:
                session_list = []
            check_sessions(metadata, session_list)

            _log.debug("Session List %s" % session_list)
            try:
                schema_path = os.path.join(
                    os.path.split(__file__)[0],
                    '../schemas/metadata.json')
                schema = json.load(open(schema_path))
                validator = DatetimeValidator()
                validator.validate(metadata, schema)
            except ValueError as e:
                _log.warning(
                    'metadata validation error: ' + str(e))

            with open(os.path.join(args.output_dir, 'metadata.json'),
                      'w') as f:
                json.dump(metadata, f, cls=JSONDateEncoder)

            run_record = []
            exec_record = {
                "run_record": run_record,
                "args": sys.argv,
                "state": abbrev
            }

            lex = None
            exc_traceback = None

            # start to run scrapers
            exec_start = dt.datetime.utcnow()

            # scraper order matters
            if args.billid is False:
                order = (
                    'legislators',
                    'committees',
                    'votes',
                    'bills',
                    'events',
                    'speeches')
            else:
                _log.debug("going to process bills")
                order = ('bills',)  # only process the bills

            _traceback = None
            try:
                for stype in order:
                    _log.debug("consider to process %s" % stype)
                    if stype in args.types:
                        _log.debug("going to process %s" % stype)
                        scraper_results = _run_scraper(stype, args, metadata)

                        run_record += scraper_results
                    else:
                        _log.debug("skipping %s" % stype)

            except Exception as e:
                _traceback = _, _, exc_traceback = sys.exc_info()
                run_record += [{"exception": e, "type": stype}]
                lex = e

            exec_end = dt.datetime.utcnow()
            exec_record['started'] = exec_start
            exec_record['ended'] = exec_end
            scrape_data['scraped'] = exec_record
            scrape_data['abbr'] = abbrev

            for record in run_record:
                if "exception" in record:
                    ex = record['exception']
                    fb = traceback.format_exception(*_traceback)
                    trace = ''.join(fb)
                    record['exception'] = {
                        'type': ex.__class__.__name__,
                        'message': str(ex),
                        'traceback': trace
                    }
                    scrape_data['failure'] = True
            if lex:
                if 'import' in args.actions:
                    try:
                        _log.debug("scrape_data:")
                        if scrape_data['failure']:
                            _log.debug("Failed")
                            _log.debug(scrape_data)
                        else:
                            _log.debug("OK")
                            _log.debug(scrape_data)
                            db.billy_runs.save(scrape_data, safe=True)

                    except KeyError as e:
                        _log.debug("Caught exception1 :")
                        _log.debug(e)
                        exit(123)

                    except pymongo.errors.OperationFailure as e:
                        _log.debug("Caught exception3 :")
                        _log.debug(e)
                        exit(123)

                    except Exception as e:
                        _log.debug("Caught exception :")
                        _log.debug(e)
                        exit(123)
                        raise lex, None, exc_traceback
                        # XXX: This should *NEVER* happen, but it has
                        # in the past, so we're going to catch any errors
                        # writing # to pymongo, and raise the original
                        # exception rather then let it look like Mongo's fault.
                        # Thanks for catching this, Thom.
                        #
                        # We lose the stack trace, but the Exception is the
                        # same in every other way.
                        #  -- paultag
                raise

        # imports
        if 'import' in args.actions:
            import_report = _do_imports(abbrev, args)
            scrape_data['imported'] = import_report
            # We're tying the run-logging into the import stage - since import
            # already writes to the DB, we might as well throw this in too.
            _log.debug(scrape_data)
            db.billy_runs.save(scrape_data, safe=True)

        # reports
        if 'report' in args.actions:
            _do_reports(abbrev, args)

        if 'session-list' in args.actions:
            if hasattr(module, 'session_list'):
                print("\n".join(module.session_list()))
            else:
                raise ScrapeError('session_list() is not defined')

    except ScrapeError as e:
        _log.debug("in update.py Scrape error")
        _log.debug("Scrape error :%s" % e)
        _log.critical('Error: %s' % e)
        sys.exit(1)

    except TypeError as e:
        _log.debug("Type error")
        _log.critical('TypeError: %s' % e)
        sys.exit(1)

    except NoData as e:
        _log.debug("No Data")
        _log.debug(e)
        _log.critical('No Data:')
        sys.exit(1)

    except NoDoc as e:
        _log.debug("No Doc")
        _log.critical('No Doc: %s' % e)
        sys.exit(1)

    except NoXpath as e:
        _log.debug("No XPath")
        _log.critical('No XPath: %s' % e)
        sys.exit(1)

    except Exception as e:
        _log.debug("Unknown error3")
        _log.debug(e)
        _log.critical('Unknown Error')
        sys.exit(1)
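
A hedged sketch of driving main() the way the billy-update console script
would; the module name 'nc' and session '2011' are illustrative, and the
scraper module must be importable via settings.SCRAPER_PATHS:

import sys

# equivalent to the command line: billy-update nc --bills -s 2011 --scrape
sys.argv = ['billy-update', 'nc', '--bills', '-s', '2011', '--scrape']
main()
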
Exemple #28
0
    def context_role(self, bill=None, vote=None, session=None, term=None):
        '''Tell this legislator object which session to use when calculating
        the legislator's context_role for a given bill or vote.
        '''
        # If no hints were given about the context, look for a related bill,
        # then for a related vote.
        if not any([bill, vote, session, term]):
            try:
                bill = self.bill
            except AttributeError:
                # A vote?
                try:
                    vote = self.vote
                except AttributeError:
                    # If we're here, this method was called on a
                    # Legislator that doesn't have a related bill or vote.
                    return ''

        # If we still have no historical point of reference, figuring
        # out the context role is impossible. Return an empty string.
        if not any([bill, vote, session, term]):
            return ''

        # First figure out the term.
        if bill is not None:
            term = bill['_term']

        elif vote is not None:
            try:
                _bill = vote.bill
            except AttributeError:
                _bill = BillVote(vote).bill
            if callable(_bill):
                _bill = _bill()
            term = _bill['_term']

        if term is None and session is not None:
            term = term_for_session(self[settings.LEVEL_FIELD], session)

        # Use the term to get the related roles. First look in the current
        # roles list, then fail over to the old_roles list.
        roles = [
            r for r in self['roles']
            if r.get('type') == 'member' and r.get('term') == term
        ]
        roles = filter(None, roles)

        if not roles:
            roles = [
                r for r in self['old_roles'].get(term, [])
                if r.get('type') == 'member'
            ]
        roles = filter(None, roles)

        if not roles:
            # Legislator had no roles for this term. If there is a related
            # bill or vote, this shouldn't happen, but could if the
            # legislator's roles got deleted.
            return ''

        # If there's only one applicable role, we're done.
        if len(roles) == 1:
            role = roles.pop()
            self['context_role'] = role
            return role

        # If only one of term or session is given and there are multiple roles:
        if not filter(None, [bill, vote]):
            if term is not None:
                role = roles[0]
                self['context_role'] = role
                return role

            # Below, use the date of the related bill or vote to determine
            # which (of multiple) roles applies.
            # Get the context date.
            if session is not None:
                # If we're here, we have multiple roles for a single session.
                # Try to find the correct one in self.metadata,
                # else give up.
                session_data = self.metadata['session_details'][session]
                for role in roles:
                    role_start = role.get('start_date')
                    role_end = role.get('end_date')

                    # Return the first role that overlaps at all with the
                    # session.
                    session_start = session_data.get('start_date')
                    session_end = session_data.get('end_date')
                    if session_start and session_end:
                        started_during = (role_start < session_start <
                                          role_end)
                        ended_during = (role_start < session_end < role_end)
                        if started_during or ended_during:
                            self['context_role'] = role
                            return role
                    else:
                        continue

                # Return first role from the session?
                role = roles[0]
                self['context_role'] = role
                return role

        if vote is not None:
            date = vote['date']
        if bill is not None:
            date = bill['action_dates']['first']

        dates_exist = False
        for role in roles:
            start_date = role.get('start_date')
            end_date = role.get('end_date')
            if start_date and end_date:
                dates_exist = True
                if start_date < date < end_date:
                    self['context_role'] = role
                    return role

        if dates_exist:
            # If we're here, the context date didn't fall into any of the
            # legislator's role date ranges.
            return ''

        else:
            # Here the roles didn't have date ranges. Return the last one?
            role = roles.pop()
            self['context_role'] = role
            return role

        return ''
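
A hedged usage sketch for the method above; leg stands for a Legislator
document with the roles/old_roles structure the code assumes, and the
session value is illustrative:

# sketch: leg is a Legislator instance; '20112012' is an illustrative session
role = leg.context_role(session='20112012')
if role:
    print(role.get('chamber'))
else:
    print('no context role could be determined')
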
Exemple #30
0
def scan_bills(abbr):
    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    # load exception data into sets of ids indexed by exception type
    quality_exceptions = get_quality_exceptions(abbr)

    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}):
        session_d = sessions[bill["session"]]

        # chamber count & bill_types
        if bill["chamber"] == "lower":
            session_d["lower_count"] += 1
        elif bill["chamber"] == "upper":
            session_d["upper_count"] += 1
        for type in bill["type"]:
            session_d["bill_types"][type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill["actions"]:
            date = action["date"]
            if date < last_date:
                session_d["actions_unsorted"].add(bill["_id"])
            # remember this date so out-of-order actions can be detected
            last_date = date
            session_d["action_count"] += 1
            for type in action["type"]:
                session_d["actions_per_type"][type] += 1
            if "other" in action["type"]:
                other_actions[action["action"]] += 1
            session_d["actions_per_actor"][action["actor"]] += 1
            session_d["actions_per_month"][date.strftime("%Y-%m")] += 1

        # handle no_actions bills
        if not bill["actions"]:
            if bill["_id"] not in quality_exceptions["bills:no_actions"]:
                session_d["actionless_count"] += 1
            else:
                quality_exceptions["bills:no_actions"].remove(bill["_id"])

        # sponsors
        for sponsor in bill["sponsors"]:
            session_d["_sponsor_count"] += 1
            if sponsor.get("leg_id") or sponsor.get("committee_id"):
                session_d["_sponsors_with_id_count"] += 1
            else:
                # keep list of unmatched sponsors
                session_d["unmatched_sponsors"].add(
                    (term_for_session(abbr, bill["session"]), bill["chamber"], sponsor["name"])
                )
            session_d["sponsors_per_type"][sponsor["type"]] += 1

        # handle no sponsors bills
        if not bill["sponsors"]:
            if bill["_id"] not in quality_exceptions["bills:no_sponsors"]:
                session_d["sponsorless_count"] += 1
            else:
                quality_exceptions["bills:no_sponsors"].remove(bill["_id"])

        # subjects
        for subj in bill.get("scraped_subjects", []):
            uncategorized_subjects[subj] += 1
        if bill.get("subjects"):
            session_d["_subjects_count"] += 1
            for subject in bill["subjects"]:
                session_d["bills_per_subject"][subject] += 1

        # sources
        for source in bill["sources"]:
            duplicate_sources[source["url"]] += 1

        # versions
        if not bill["versions"]:
            # total num of bills w/o versions
            if bill["_id"] not in quality_exceptions["bills:no_versions"]:
                session_d["versionless_count"] += 1
            else:
                quality_exceptions["bills:no_versions"].remove(bill["_id"])
        else:
            # total num of versions
            session_d["version_count"] += len(bill["versions"])
        for doc in bill["versions"]:
            duplicate_versions[doc["url"]] += 1
        # TODO: add duplicate document detection back in?

    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    # do logging of unnecessary exceptions
    for qe_type, qes in quality_exceptions.iteritems():
        if qes:
            logger.warning(
                "unnecessary {0} exceptions for {1} bills: \n  {2}".format(qe_type, len(qes), "\n  ".join(qes))
            )

    return {
        "duplicate_versions": dup_version_urls,
        "duplicate_sources": dup_source_urls,
        "other_actions": other_actions.items(),
        "uncategorized_subjects": uncategorized_subjects.items(),
        "sessions": sessions,
    }
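
A minimal sketch of consuming the report built above; the abbreviation is
illustrative and a configured db is assumed:

# sketch -- "nc" is illustrative; db and settings come from the module
report = scan_bills("nc")
for session, stats in report["sessions"].iteritems():
    print("%s: %s lower / %s upper bills" %
          (session, stats["lower_count"], stats["upper_count"]))
print("duplicate version URLs: %d" % len(report["duplicate_versions"]))
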
Exemple #32
0
def scan_bills(abbr):
    metadata = db.metadata.find_one({'_id': abbr})
    level = metadata['level']

    duplicate_sources = defaultdict(int)
    duplicate_versions = defaultdict(int)
    other_actions = defaultdict(int)
    uncategorized_subjects = defaultdict(int)
    sessions = defaultdict(_bill_report_dict)

    for bill in db.bills.find({'level': level, level: abbr}):
        session_d = sessions[bill['session']]

        # chamber count & bill_types
        if bill['chamber'] == 'lower':
            session_d['lower_count'] += 1
        elif bill['chamber'] == 'upper':
            session_d['upper_count'] += 1
        for type in bill['type']:
            session_d['bill_types'][type] += 1

        update_common(bill, session_d)

        # actions
        last_date = datetime.datetime(1900, 1, 1)
        for action in bill['actions']:
            date = action['date']
            if date < last_date:
                session_d['actions_unsorted'].add(bill['_id'])
            # remember this date so out-of-order actions can be detected
            last_date = date
            session_d['action_count'] += 1
            for type in action['type']:
                session_d['actions_per_type'][type] += 1
            if 'other' in action['type']:
                other_actions[action['action']] += 1
            session_d['actions_per_actor'][action['actor']] += 1
            session_d['actions_per_month'][date.strftime('%Y-%m')] += 1
        if not bill['actions']:
            session_d['actionless_count'] += 1

        # sponsors
        for sponsor in bill['sponsors']:
            session_d['_sponsor_count'] += 1
            if sponsor.get('leg_id'):
                session_d['_sponsors_with_leg_id_count'] += 1
            else:
                # keep missing leg_ids
                session_d['unmatched_leg_ids'].add(
                    (term_for_session(abbr, bill['session']), bill['chamber'],
                    sponsor['name'])
                )
            session_d['sponsors_per_type'][sponsor['type']] += 1
        if not bill['sponsors']:
            session_d['sponsorless_count'] += 1

        # votes
        for vote in bill['votes']:
            session_d['vote_count'] += 1
            if vote['passed']:
                session_d['_passed_vote_count'] += 1
            session_d['votes_per_chamber'][vote['chamber']] += 1
            if not vote.get('type'):
                logger.warning('vote is missing type on %s' % bill['_id'])
                continue
            session_d['votes_per_type'][vote.get('type')] += 1
            if not vote.get('date'):
                logger.warning('vote is missing date on %s' % bill['_id'])
                continue
            session_d['votes_per_month'][vote['date'].strftime('%Y-%m')] += 1

            # roll calls
            has_rollcalls = False
            for rc in (vote['yes_votes'] + vote['no_votes'] +
                       vote['other_votes']):
                has_rollcalls = True
                session_d['_rollcall_count'] += 1
                if rc.get('leg_id'):
                    session_d['_rollcalls_with_leg_id_count'] += 1
                else:
                    # keep missing leg_ids
                    session_d['unmatched_leg_ids'].add(
                        (term_for_session(abbr, bill['session']),
                         vote['chamber'],
                        rc['name'])
                    )

            # check counts if any rollcalls are present
            if (has_rollcalls and
                (len(vote['yes_votes']) != vote['yes_count'] or
                 len(vote['no_votes']) != vote['no_count'] or
                 len(vote['other_votes']) != vote['other_count'])):
                session_d['bad_vote_counts'].add(bill['_id'])

        # subjects
        for subj in bill.get('scraped_subjects', []):
            uncategorized_subjects[subj] += 1
        if bill.get('subjects'):
            session_d['_subjects_count'] += 1
            for subject in bill['subjects']:
                session_d['bills_per_subject'][subject] += 1

        # sources
        for source in bill['sources']:
            duplicate_sources[source['url']] += 1

        # versions
        if not bill['versions']:
            # total num of bills w/o versions
            session_d['versionless_count'] += 1
        else:
            # total num of versions
            session_d['version_count'] += len(bill['versions'])
        for doc in bill['versions']:
            duplicate_versions[doc['url']] += 1
        # TODO: add duplicate document detection back in?

    dup_version_urls = []
    dup_source_urls = []
    for url, n in duplicate_versions.iteritems():
        if n > 1:
            dup_version_urls.append(url)
    for url, n in duplicate_sources.iteritems():
        if n > 1:
            dup_source_urls.append(url)

    return {'duplicate_versions': dup_version_urls,
            'duplicate_sources': dup_source_urls,
            'other_actions': other_actions.items(),
            'uncategorized_subjects': uncategorized_subjects.items(),
            'sessions': sessions,
           }
Exemple #33
0
def import_bill(data, votes, categorizer):
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # This is a hack added for Rhode Island, where we can't determine
    # the full bill_id. If this key is in the metadata we just use the
    # numeric portion. Not ideal, as it won't work in states where
    # HB/SBs overlap, but in RI they never do.
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop((data['chamber'], data['session'],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])

    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        id = get_legislator_id(abbr, data['session'], None,
                               sponsor['name'])
        sponsor['leg_id'] = id
        if id is None:
            cid = get_committee_id(level, abbr, data['chamber'], sponsor['name'])
            if cid is not None:
                sponsor['committee_id'] = cid

    # process votes
    for vote in data['votes']:

        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                id = get_legislator_id(abbr, data['session'],
                                       vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': id})

            vote[vtype] = svlist

    # process actions
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}
    for action in data['actions']:

        # We'll try to recover some Committee IDs here.
        if "committee" in action:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   action['committee'])
            action['_scraped_committee_name'] = action['committee']
            if cid is not None:
                action['committee'] = cid
            else:
                del(action['committee'])

        adate = action['date']

        # first & last
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        elif not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed
        if (not dates['passed_upper'] and action['actor'] == 'upper'
            and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
            and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        bill_id = insert_with_id(data)
        denormalize_votes(data, bill_id)
        return "insert"
    else:
        update(bill, data, db.bills)
        denormalize_votes(data, bill['_id'])
        return "update"
Exemple #34
0
def vote_csv(state, session, chamber, out=sys.stdout):
    term = utils.term_for_session(state, session)

    votes = {}
    legislators = {}

    elemMatch = {'state': state, 'chamber': chamber,
                 'type': 'member', 'term': term}

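    # the $or below matches legislators whose membership appears either in
    # their current roles list or in the archived old_roles for this term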
    for leg in db.legislators.find({'$or':
                                    [{'roles': {'$elemMatch': elemMatch}},
                                     {('old_roles.%s' % term):
                                      {'$elemMatch': elemMatch}}]}):
        votes[leg['leg_id']] = []
        legislators[leg['leg_id']] = leg

    for bill in db.bills.find({'state': state, 'chamber': chamber,
                               'session': session}):
        for vote in bill['votes']:
            if 'committee' in vote and vote['committee']:
                continue
            if vote['chamber'] != chamber:
                continue

            seen = set()

            for yv in vote['yes_votes']:
                leg_id = yv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(1)
                    except KeyError:
                        continue
            for nv in vote['no_votes']:
                leg_id = nv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(6)
                    except KeyError:
                        continue

            for leg_id in set(votes.keys()) - seen:
                votes[leg_id].append(9)

    out = csv.writer(out)

    for (leg_id, vs) in votes.iteritems():
        leg = legislators[leg_id]

        try:
            party = leg['old_roles'][term][0]['party']
        except KeyError:
            party = leg['party']

        row = [leg['full_name'].encode('ascii', 'replace'), leg['leg_id'],
               party]
        for vote in vs:
            row.append(str(vote))

        out.writerow(row)
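
A hedged sketch of calling the exporter above; the state and session are
illustrative, a configured db is assumed, and each row uses the numeric
coding from the loop (1 for a yes vote, 6 for a no vote, 9 when a
legislator was not recorded on a vote):

from StringIO import StringIO   # Python 2, matching the iteritems usage

buf = StringIO()
vote_csv('nc', '2011', 'lower', out=buf)   # 'nc'/'2011' are illustrative
print(buf.getvalue())
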
Exemple #35
0
def vote_csv(state, session, chamber, out=sys.stdout):
    term = utils.term_for_session(state, session)

    votes = {}
    legislators = {}

    elemMatch = {
        'state': state,
        'chamber': chamber,
        'type': 'member',
        'term': term
    }

    for leg in db.legislators.find({
            '$or': [{
                'roles': {
                    '$elemMatch': elemMatch
                }
            }, {
                ('old_roles.%s' % term): {
                    '$elemMatch': elemMatch
                }
            }]
    }):
        votes[leg['leg_id']] = []
        legislators[leg['leg_id']] = leg

    for bill in db.bills.find({
            'state': state,
            'chamber': chamber,
            'session': session
    }):
        for vote in bill['votes']:
            if 'committee' in vote and vote['committee']:
                continue
            if vote['chamber'] != chamber:
                continue

            seen = set()

            for yv in vote['yes_votes']:
                leg_id = yv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(1)
                    except KeyError:
                        continue
            for nv in vote['no_votes']:
                leg_id = nv['leg_id']
                if leg_id:
                    seen.add(leg_id)
                    try:
                        votes[leg_id].append(6)
                    except KeyError:
                        continue

            for leg_id in set(votes.keys()) - seen:
                votes[leg_id].append(9)

    out = csv.writer(out)

    for (leg_id, vs) in votes.iteritems():
        leg = legislators[leg_id]

        try:
            party = leg['old_roles'][term][0]['party']
        except KeyError:
            party = leg['party']

        row = [
            leg['full_name'].encode('ascii', 'replace'), leg['leg_id'], party
        ]
        for vote in vs:
            row.append(str(vote))

        out.writerow(row)
Example #36
0
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately
        categorizer - SubjectCategorizer (None - no categorization)
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects;
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent
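    # (the matcher learns ids from the prior copy of the bill first, so
    # re-imported versions/documents keep their existing ids)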
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # This is a hack initially added for Rhode Island, where we can't
        # determine the full bill_id. If this key is in the metadata we use
        # just the numeric portion. Not ideal, as it won't work where HB/SB
        # numbers overlap, but in RI they never do.
        # pull off numeric portion of bill_id
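        # e.g. 'HB 102' -> '102'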
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    vote_flags = {
        "bill:passed", "bill:failed", "bill:veto_override:passed",
        "bill:veto_override:failed", "amendment:passed", "amendment:failed",
        "committee:passed", "committee:passed:favorable",
        "committee:passed:unfavorable", "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()
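    # a vote may plausibly match several actions; already_linked records the
    # first match, and any vote matched a second time lands in remove_vote so
    # its links can be stripped after the loop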

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                entity_id = resolver(entity['name'])
                entity['id'] = entity_id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
              and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
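                # heuristic: a roll call within ~20 hours of a qualifying
                # action in the same chamber is assumed to be the same event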
                if (delta < datetime.timedelta(hours=20)
                        and vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    # Make sure the primary title isn't included in the alternate title list
    alt_titles.discard(data.get('title'))
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
Example #37
0
def import_bill(data, votes, categorizer):
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects;
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # This is a hack added for Rhode Island, where we can't determine the
    # full bill_id. If this key is in the metadata we use just the numeric
    # portion. Not ideal, as it won't work in states where HB/SB numbers
    # overlap, but in RI they never do.
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        leg_id = get_legislator_id(abbr, data['session'], None,
                                   sponsor['name'])
        sponsor['leg_id'] = leg_id

    for vote in data['votes']:

        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                leg_id = get_legislator_id(abbr, data['session'],
                                           vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': leg_id})

            vote[vtype] = svlist

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    # Make sure the primary title isn't included in the alternate title list
    alt_titles.discard(data.get('title'))
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        insert_with_id(data)
        return "insert"
    else:
        update(bill, data, db.bills)
        return "update"