def main():
    """Normalise the bill references stored in each event's ``related_bills``.

    Jurisdiction abbreviations come from the command line
    (``sys.argv[1:]``); when none are given, every ``abbreviation`` found in
    ``db.metadata`` is processed.

    For every event that has a non-empty ``related_bills`` list, each entry
    is handled as follows:

    * a ``bill_id`` matching ``[A-Z]{2}B\\d{8}`` is moved to the ``id`` key;
    * any other ``bill_id`` is replaced by ``fix_bill_id(bill_id)``;
    * a ``_scraped_bill_id`` is popped, run through ``fix_bill_id`` and
      stored under ``bill_id``.

    Events that had at least one entry touched are written back with
    ``db.events.save``.  A per-jurisdiction ``Counter`` of the operations
    performed is accumulated in ``tally``.
    """
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)
    tally = defaultdict(Counter)

    for abbr in abbrs:
        # Key the tally by the jurisdiction being processed so that counts
        # from different jurisdictions are kept apart.
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }

        for event in db.events.find(spec):
            # Becomes True as soon as any related bill entry is touched.
            fixed = False

            for bill in event['related_bills']:
                bill_id = bill.get('bill_id')
                if bill_id is not None:
                    # If "bill_id" is a big id, rename it.
                    if re.match(r'[A-Z]{2}B\d{8}', bill_id):
                        _id = bill.pop('bill_id')
                        bill['id'] = _id
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1
                    # If it's something else, do fix_bill_id to
                    # fix screwed up old ids.
                    else:
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1
                    fixed = True

                if '_scraped_bill_id' in bill:
                    bill_id = fix_bill_id(bill.pop('_scraped_bill_id'))
                    bill['bill_id'] = bill_id
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1
                    fixed = True

            if fixed:
                msg = 'Updating related_bills on event %r.'
                logger.debug(msg % event['_id'])
                db.events.save(event)

        logger.info(abbr)
def main():
    """Normalise the bill references stored in each event's ``related_bills``.

    Jurisdiction abbreviations come from the command line
    (``sys.argv[1:]``); when none are given, every ``abbreviation`` found in
    ``db.metadata`` is processed.

    For every event that has a non-empty ``related_bills`` list, each entry
    is handled as follows:

    * a ``bill_id`` matching ``[A-Z]{2}B\\d{8}`` is moved to the ``id`` key;
    * any other ``bill_id`` is replaced by ``fix_bill_id(bill_id)``;
    * a ``_scraped_bill_id`` is popped, run through ``fix_bill_id`` and
      stored under ``bill_id``.

    Events that had at least one entry touched are written back with
    ``db.events.save``.  A per-jurisdiction ``Counter`` of the operations
    performed is accumulated in ``tally``.
    """
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)
    tally = defaultdict(Counter)

    for abbr in abbrs:
        # Key the tally by the jurisdiction being processed so that counts
        # from different jurisdictions are kept apart.
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }

        for event in db.events.find(spec):
            # Becomes True as soon as any related bill entry is touched.
            fixed = False

            for bill in event['related_bills']:
                bill_id = bill.get('bill_id')
                if bill_id is not None:
                    # If "bill_id" is a big id, rename it.
                    if re.match(r'[A-Z]{2}B\d{8}', bill_id):
                        _id = bill.pop('bill_id')
                        bill['id'] = _id
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1
                    # If it's something else, do fix_bill_id to
                    # fix screwed up old ids.
                    else:
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1
                    fixed = True

                if '_scraped_bill_id' in bill:
                    bill_id = fix_bill_id(bill.pop('_scraped_bill_id'))
                    bill['bill_id'] = bill_id
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1
                    fixed = True

            if fixed:
                msg = 'Updating related_bills on event %r.'
                logger.debug(msg % event['_id'])
                db.events.save(event)

        logger.info(abbr)
def document(request, abbr, session, bill_id, doc_id):
    '''
    Render one version document of a bill.

    The bill id from the URL is normalised with ``fix_bill_id``; if the URL
    did not already carry the normalised id with spaces removed, the request
    is redirected to the URL that does.

    Raises ``Http404`` when no bill matches, or when the bill has no version
    whose ``doc_id`` equals ``doc_id``.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    '''
    # get fixed version
    fixed_bill_id = fix_bill_id(bill_id)

    # redirect if URL's id isn't fixed id without spaces
    if fixed_bill_id.replace(' ', '') != bill_id:
        return redirect('document', abbr=abbr, session=session,
                        bill_id=fixed_bill_id.replace(' ', ''),
                        doc_id=doc_id)

    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': session,
                              'bill_id': fixed_bill_id})
    # find_one yields None for an unknown bill; answer with a 404 rather
    # than failing on the subscript below.
    if not bill:
        raise Http404('No such bill.')

    # The for/else leaves ``version`` bound to the matching entry, or raises
    # when the loop finishes without a match.
    for version in bill['versions']:
        if version['doc_id'] == doc_id:
            break
    else:
        raise Http404('No such document.')

    return render(request, templatename('document'),
                  dict(abbr=abbr, session=session, bill=bill,
                       version=version, metadata=bill.metadata,
                       nav_active='bills'))
def import_events(abbr, data_dir, import_actions=False):
    """Import every event JSON file found under ``<data_dir>/<abbr>/events``.

    Each file is loaded and passed through ``prepare_obj``.  Participants get
    an ``id`` resolved through ``get_committee_id`` / ``get_legislator_id``
    (``None`` for participant types without a resolver).  For each related
    bill the scraped id is kept in ``_scraped_bill_id`` and ``bill_id`` is
    replaced by the ``_id`` of the matching bill document, or ``None`` when
    no bill matches.  The event is then handed to ``import_event``; once all
    files are processed ``ensure_indexes`` is called.

    ``import_actions`` is accepted but not used by this function.
    """
    events_glob = os.path.join(os.path.join(data_dir, abbr),
                               'events', '*.json')

    for event_path in glob.iglob(events_glob):
        with open(event_path) as handle:
            data = prepare_obj(json.load(handle))

        def _resolve_ctty(committee):
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee['chamber'],
                                    committee['participant'])

        def _resolve_leg(leg):
            # Only 'upper' / 'lower' are passed on as a chamber.
            if leg['chamber'] in ['upper', 'lower']:
                chamber = leg['chamber']
            else:
                chamber = None
            return get_legislator_id(abbr, data['session'], chamber,
                                     leg['participant'])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}

        for participant in data['participants']:
            kind = participant['participant_type']
            resolved = None
            if kind in resolvers:
                resolved = resolvers[kind](participant)
            else:
                logger.warning("I don't know how to resolve a %s" % kind)
            participant['id'] = resolved

        for related in data['related_bills']:
            scraped_id = related['bill_id']
            related['_scraped_bill_id'] = scraped_id
            fixed_id = fix_bill_id(scraped_id)
            related['bill_id'] = ""

            # Match on the primary id or on any alternate id.
            by_primary = {
                settings.LEVEL_FIELD: abbr,
                'session': data['session'],
                'bill_id': fixed_id
            }
            by_alternate = {
                settings.LEVEL_FIELD: abbr,
                'session': data['session'],
                'alternate_bill_ids': fixed_id
            }
            matched = db.bills.find_one({"$or": [by_primary, by_alternate]})

            if not matched:
                logger.warning("Error: Can't find %s" % fixed_id)
                matched = {'_id': None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['bill_id'] = matched['_id']

        import_event(data)

    ensure_indexes()
def bill(request, abbr, session, bill_id):
    '''
    Render the detail page of a single bill.

    Redirects to the canonical URL when ``bill_id`` is not already the
    ``fix_bill_id`` form with spaces removed, and raises ``Http404`` when no
    bill document matches.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    # Normalised id, plus the space-free spelling used in URLs.
    fixed_bill_id = fix_bill_id(bill_id)
    url_bill_id = fixed_bill_id.replace(' ', '')
    if url_bill_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=url_bill_id)

    bill_spec = {settings.LEVEL_FIELD: abbr,
                 'session': session,
                 'bill_id': fixed_bill_id}
    _bill = db.bills.find_one(bill_spec)
    if _bill is None:
        message = u'no bill found {0} {1} {2}'.format(abbr, session, bill_id)
        raise Http404(message)

    # Newest events first, capped at EVENT_PAGE_COUNT entries.
    event_spec = {
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": _bill['_id']
    }
    event_cursor = db.events.find(event_spec).sort("when", -1)
    events = list(event_cursor)[:EVENT_PAGE_COUNT]

    popularity.counter.inc('bills', _bill['_id'], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get('show_all_sponsors')
    sponsors = _bill.sponsors_manager
    if not show_all_sponsors:
        sponsors = sponsors.first_fifteen

    template = templatename('bill')
    context = {
        'vote_preview_row_template': templatename('vote_preview_row'),
        'abbr': abbr,
        'metadata': Metadata.get_object(abbr),
        'bill': _bill,
        'events': events,
        'show_all_sponsors': show_all_sponsors,
        'sponsors': sponsors,
        'sources': _bill['sources'],
        'nav_active': 'bills',
    }
    return render(request, template, context)
def bill(request, abbr, session, bill_id):
    '''
    Show one bill together with its most recent events.

    A request whose ``bill_id`` differs from the space-free ``fix_bill_id``
    form is redirected to the canonical URL; an unknown bill raises
    ``Http404``.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    compact_id = fixed_bill_id.replace(' ', '')

    # Only the canonical, space-free id is served directly.
    if compact_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=compact_id)

    bill_doc = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                                  'session': session,
                                  'bill_id': fixed_bill_id})
    if bill_doc is None:
        raise Http404(
            u'no bill found {0} {1} {2}'.format(abbr, session, bill_id))

    # Events referencing this bill, most recent first, truncated to one page.
    related_events = db.events.find({
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": bill_doc['_id']
    })
    events = list(related_events.sort("when", -1))
    events = events[:EVENT_PAGE_COUNT]

    popularity.counter.inc('bills', bill_doc['_id'],
                           abbr=abbr, session=session)

    show_all_sponsors = request.GET.get('show_all_sponsors')
    sponsors = (bill_doc.sponsors_manager if show_all_sponsors
                else bill_doc.sponsors_manager.first_fifteen)

    page_template = templatename('bill')
    context = dict(
        vote_preview_row_template=templatename('vote_preview_row'),
        abbr=abbr,
        metadata=Metadata.get_object(abbr),
        bill=bill_doc,
        events=events,
        show_all_sponsors=show_all_sponsors,
        sponsors=sponsors,
        sources=bill_doc['sources'],
        nav_active='bills',
    )
    return render(request, page_template, context)
def bill(request, abbr, session, bill_id):
    """
    Bill detail view.

    Redirects non-canonical bill ids to the space-free ``fix_bill_id`` form
    and raises ``Http404`` when the bill cannot be found.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - events
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    """
    fixed_bill_id = fix_bill_id(bill_id)
    canonical_url_id = fixed_bill_id.replace(" ", "")
    if canonical_url_id != bill_id:
        # The URL carried an unnormalised id.
        return redirect("bill", abbr=abbr, session=session,
                        bill_id=canonical_url_id)

    lookup = {
        settings.LEVEL_FIELD: abbr,
        "session": session,
        "bill_id": fixed_bill_id,
    }
    found = db.bills.find_one(lookup)
    if found is None:
        raise Http404(
            u"no bill found {0} {1} {2}".format(abbr, session, bill_id))

    # Latest events first; keep at most EVENT_PAGE_COUNT of them.
    events_query = {
        settings.LEVEL_FIELD: abbr,
        "related_bills.bill_id": found["_id"],
    }
    events = list(db.events.find(events_query).sort("when", -1))
    del events[EVENT_PAGE_COUNT:]

    popularity.counter.inc("bills", found["_id"], abbr=abbr, session=session)

    show_all_sponsors = request.GET.get("show_all_sponsors")
    if show_all_sponsors:
        sponsors = found.sponsors_manager
    else:
        sponsors = found.sponsors_manager.first_fifteen

    template = templatename("bill")
    context = {
        "vote_preview_row_template": templatename("vote_preview_row"),
        "abbr": abbr,
        "metadata": Metadata.get_object(abbr),
        "bill": found,
        "events": events,
        "show_all_sponsors": show_all_sponsors,
        "sponsors": sponsors,
        "sources": found["sources"],
        "nav_active": "bills",
    }
    return render(request, template, context)
def func(request, abbr, session, bill_id, key):
    """Render the ``bill_all_<key>`` template for a single bill.

    Redirects to the ``bill`` URL when ``bill_id`` is not the space-free
    ``fix_bill_id`` form; raises ``Http404`` when no bill matches.
    """
    fixed_bill_id = fix_bill_id(bill_id)
    url_bill_id = fixed_bill_id.replace(" ", "")

    # Non-canonical ids are sent to the canonical bill URL.
    if url_bill_id != bill_id:
        return redirect("bill", abbr=abbr, session=session,
                        bill_id=url_bill_id)

    lookup = {
        settings.LEVEL_FIELD: abbr,
        "session": session,
        "bill_id": fixed_bill_id,
    }
    found = db.bills.find_one(lookup)
    if found is None:
        raise Http404(
            "no bill found {0} {1} {2}".format(abbr, session, bill_id))

    template = templatename("bill_all_%s" % key)
    context = {
        "abbr": abbr,
        "metadata": Metadata.get_object(abbr),
        "bill": found,
        "sources": found["sources"],
        "nav_active": "bills",
    }
    return render(request, template, context)
def import_events(abbr, data_dir, import_actions=False):
    """Import every event JSON file found under ``<data_dir>/<abbr>/events``.

    Each file is loaded and passed through ``prepare_obj``.  Participants get
    an ``id`` resolved through ``get_committee_id`` / ``get_legislator_id``
    (``None`` for participant types without a resolver).  Each related bill
    gets ``id`` set to the ``_id`` of the matching bill document (``None``
    when nothing matches) and ``bill_id`` set to the ``fix_bill_id`` form of
    the scraped id.  The event is then handed to ``import_event``.

    ``import_actions`` is accepted but not used by this function.
    """
    events_glob = os.path.join(os.path.join(data_dir, abbr),
                               "events", "*.json")

    for event_path in glob.iglob(events_glob):
        with open(event_path) as handle:
            data = prepare_obj(json.load(handle))

        def _resolve_ctty(committee):
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee["chamber"],
                                    committee["participant"])

        def _resolve_leg(leg):
            # Only 'upper' / 'lower' are passed on as a chamber.
            if leg["chamber"] in ["upper", "lower"]:
                chamber = leg["chamber"]
            else:
                chamber = None
            return get_legislator_id(abbr, data["session"], chamber,
                                     leg["participant"])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}

        for participant in data["participants"]:
            kind = participant["participant_type"]
            if kind not in resolvers:
                logger.warning("I don't know how to resolve a %s" % kind)
                participant["id"] = None
                continue
            participant["id"] = resolvers[kind](participant)

        for related in data["related_bills"]:
            fixed_id = fix_bill_id(related["bill_id"])

            # Match on the primary id or on any alternate id.
            by_primary = {
                settings.LEVEL_FIELD: abbr,
                "session": data["session"],
                "bill_id": fixed_id,
            }
            by_alternate = {
                settings.LEVEL_FIELD: abbr,
                "session": data["session"],
                "alternate_bill_ids": fixed_id,
            }
            matched = db.bills.find_one({"$or": [by_primary, by_alternate]})

            if not matched:
                logger.warning("Error: Can't find %s" % fixed_id)
                matched = {"_id": None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related["id"] = matched["_id"]
            related["bill_id"] = fixed_id

        import_event(data)
def document(request, abbr, session, bill_id, doc_id):
    '''
    Show one version document of a bill.

    Non-canonical bill ids are redirected to the space-free ``fix_bill_id``
    form.  Raises ``Http404`` for an unknown bill or an unknown ``doc_id``.
    When ``settings.ENABLE_DOCUMENT_VIEW`` has no truthy entry for ``abbr``
    the request is redirected to the version's own ``url`` instead of
    rendering the template.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    url_bill_id = fixed_bill_id.replace(' ', '')
    if url_bill_id != bill_id:
        return redirect('document', abbr=abbr, session=session,
                        bill_id=url_bill_id, doc_id=doc_id)

    lookup = {
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_bill_id
    }
    bill = db.bills.find_one(lookup)
    if not bill:
        raise Http404('No such bill.')

    # First version whose doc_id matches, if any.
    version = next(
        (candidate for candidate in bill['versions']
         if candidate['doc_id'] == doc_id),
        None)
    if version is None:
        raise Http404('No such document.')

    document_view_enabled = settings.ENABLE_DOCUMENT_VIEW.get(abbr, False)
    if not document_view_enabled:
        return redirect(version['url'])

    template = templatename('document')
    context = {
        'abbr': abbr,
        'session': session,
        'bill': bill,
        'version': version,
        'metadata': bill.metadata,
        'nav_active': 'bills',
    }
    return render(request, template, context)
def func(request, abbr, session, bill_id, key):
    '''Render the ``bill_all_<key>`` template for a single bill.

    Redirects to the ``bill`` URL when ``bill_id`` is not the space-free
    ``fix_bill_id`` form; raises ``Http404`` when no bill matches.
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    compact_id = fixed_bill_id.replace(' ', '')

    # Serve only the canonical id; everything else is redirected.
    if compact_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=compact_id)

    bill_doc = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': session,
        'bill_id': fixed_bill_id,
    })
    if bill_doc is None:
        not_found = 'no bill found {0} {1} {2}'.format(abbr, session, bill_id)
        raise Http404(not_found)

    page_template = templatename('bill_all_%s' % key)
    context = dict(
        abbr=abbr,
        metadata=Metadata.get_object(abbr),
        bill=bill_doc,
        sources=bill_doc['sources'],
        nav_active='bills',
    )
    return render(request, page_template, context)
def load_standalone_votes(data_dir):
    """Load the vote JSON files found under ``<data_dir>/votes``.

    Each file is loaded and passed through ``prepare_obj``.  Its ``bill_id``
    is popped and normalised with ``fix_bill_id``.

    Returns a ``defaultdict(list)`` mapping
    ``(bill_chamber, session, bill_id)`` to the list of vote objects for
    that bill.
    """
    vote_files = glob.glob(os.path.join(data_dir, 'votes', '*.json'))
    votes = defaultdict(list)

    for vote_file in vote_files:
        with open(vote_file) as handle:
            vote = prepare_obj(json.load(handle))

        # need to match bill_id already in the database
        fixed_id = fix_bill_id(vote.pop('bill_id'))
        key = (vote['bill_chamber'], vote['session'], fixed_id)
        votes[key].append(vote)

    logger.info('imported %s vote files' % len(vote_files))
    return votes
def bill(request, abbr, session, bill_id):
    '''
    Render the detail page of a single bill.

    Redirects to the canonical URL when ``bill_id`` is not already the
    ``fix_bill_id`` form with spaces removed, and raises ``Http404`` when no
    bill document matches.

    Context:
        - vote_preview_row_template
        - abbr
        - metadata
        - bill
        - show_all_sponsors
        - sponsors
        - sources
        - nav_active

    Templates:
        - billy/web/public/bill.html
        - billy/web/public/vote_preview_row.html
    '''
    fixed_bill_id = fix_bill_id(bill_id)
    url_bill_id = fixed_bill_id.replace(' ', '')

    # Anything but the canonical space-free id is redirected.
    if url_bill_id != bill_id:
        return redirect('bill', abbr=abbr, session=session,
                        bill_id=url_bill_id)

    lookup = {settings.LEVEL_FIELD: abbr,
              'session': session,
              'bill_id': fixed_bill_id}
    bill_doc = db.bills.find_one(lookup)
    if bill_doc is None:
        raise Http404(
            u'no bill found {0} {1} {2}'.format(abbr, session, bill_id))

    # Without the query flag only the first fifteen sponsors are shown.
    show_all_sponsors = request.GET.get('show_all_sponsors')
    sponsors = bill_doc.sponsors_manager
    if not show_all_sponsors:
        sponsors = sponsors.first_fifteen

    template = templatename('bill')
    context = {
        'vote_preview_row_template': templatename('vote_preview_row'),
        'abbr': abbr,
        'metadata': Metadata.get_object(abbr),
        'bill': bill_doc,
        'show_all_sponsors': show_all_sponsors,
        'sponsors': sponsors,
        'sources': bill_doc['sources'],
        'nav_active': 'bills',
    }
    return render(request, template, context)
def _build_mongo_filter(request, keys, icase=True):
    """Build a MongoDB filter dict from the GET parameters named in ``keys``.

    The key ``'fields'`` is always ignored, as are parameters that are
    missing or empty.  For each remaining key:

    * keys listed in ``_lower_fields`` are matched on the lower-cased value;
    * keys ending in ``__in`` are stored under the key without that suffix,
      with the value split on ``|`` into a list;
    * ``bill_id`` is upper-cased and normalised with ``fix_bill_id``;
    * every other key becomes an anchored, case-insensitive regular
      expression matching the value literally.

    ``icase`` is accepted for interface compatibility but is not consulted
    by this function.

    Returns the filter as a plain ``dict``.
    """
    _filter = {}
    keys = set(keys) - set(['fields'])

    for key in keys:
        value = request.GET.get(key)
        if not value:
            continue

        if key in _lower_fields:
            _filter[key] = value.lower()
        elif key.endswith('__in'):
            _filter[key[:-4]] = value.split('|')
        elif key == 'bill_id':
            _filter[key] = fix_bill_id(value.upper())
        else:
            # We use regex queries to get case insensitive search - this
            # means they won't use any indexes for now. Real case
            # insensitive queries are coming eventually:
            # http://jira.mongodb.org/browse/SERVER-90
            #
            # The value comes straight from the query string, so it is
            # escaped: metacharacters must match literally and must not be
            # able to alter (or blow up) the pattern.
            _filter[key] = re.compile('^%s$' % re.escape(value),
                                      re.IGNORECASE)

    return _filter
def document(request, abbr, session, bill_id, doc_id):
    """
    Show one version document of a bill.

    Non-canonical bill ids are redirected to the space-free ``fix_bill_id``
    form.  Raises ``Http404`` for an unknown bill or an unknown ``doc_id``.
    When ``settings.ENABLE_DOCUMENT_VIEW`` has no truthy entry for ``abbr``
    the request is redirected to the version's own ``url`` instead of
    rendering the template.

    Context:
        - abbr
        - session
        - bill
        - version
        - metadata
        - nav_active

    Templates:
        - billy/web/public/document.html
    """
    fixed_bill_id = fix_bill_id(bill_id)
    compact_id = fixed_bill_id.replace(" ", "")

    # Serve only the canonical id; everything else is redirected.
    if compact_id != bill_id:
        return redirect("document", abbr=abbr, session=session,
                        bill_id=compact_id, doc_id=doc_id)

    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        "session": session,
        "bill_id": fixed_bill_id,
    })
    if not bill:
        raise Http404("No such bill.")

    # Pick the version carrying the requested doc_id.
    matching = [v for v in bill["versions"][:1] if v["doc_id"] == doc_id]
    if not matching:
        for candidate in bill["versions"][1:]:
            if candidate["doc_id"] == doc_id:
                matching = [candidate]
                break
    if not matching:
        raise Http404("No such document.")
    version = matching[0]

    if not settings.ENABLE_DOCUMENT_VIEW.get(abbr, False):
        return redirect(version["url"])

    template = templatename("document")
    context = dict(
        abbr=abbr,
        session=session,
        bill=bill,
        version=version,
        metadata=bill.metadata,
        nav_active="bills",
    )
    return render(request, template, context)
def search_by_bill_id(abbr, search_text):
    '''Find bills with ids like "HB1234".

    ``search_text`` is only treated as a bill id when it contains at least
    one digit.  It is normalised with ``fix_bill_id`` and upper-cased, then
    looked up exactly; when the exact lookup finds nothing the normalised id
    is retried as a ``$regex`` match.  The lookup is restricted to ``abbr``
    unless ``abbr`` is ``'all'``.

    Returns the matching bill documents as a list sorted by ``session`` in
    descending order, or ``None`` when the text does not look like a bill id
    or nothing matches.
    '''
    # Text without a digit cannot be a bill id.
    if not re.search(r'\d', search_text):
        return None

    bill_id = fix_bill_id(search_text).upper()
    collection = db.bills
    spec = {'bill_id': bill_id}
    if abbr != 'all':
        spec[settings.LEVEL_FIELD] = abbr

    docs = collection.find(spec)

    # Nothing matched exactly: fall back to a regex match on the id.
    if 0 == docs.count():
        spec['bill_id'] = {'$regex': bill_id}
        docs = collection.find(spec)

    # If there were actual results, return them, newest session name first.
    if 0 < docs.count():
        return sorted(docs, key=operator.itemgetter('session'), reverse=True)

    return None
def import_bill(data, standalone_votes, categorizer):
    """
    Insert or update a bill.

    data - raw bill JSON; modified in place (ids normalised, subjects
        moved to ``scraped_subjects``, ``action_dates``, ``_term`` and
        ``alternate_titles`` filled in)
    standalone_votes - votes scraped separately, keyed by
        ``(chamber, session, bill_id)``; the entry for this bill is popped
    categorizer - SubjectCategorizer (None - no categorization)

    Returns ``"insert"`` when no bill with the same level, session, chamber
    and bill_id existed yet, ``"update"`` otherwise.
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(bid) for bid in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber is dropped from the query rather than matched
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    # action types for which a matching vote is looked up
    vote_flags = {
        "bill:passed", "bill:failed", "bill:veto_override:passed",
        "bill:veto_override:failed", "amendment:passed", "amendment:failed",
        "committee:passed", "committee:passed:favorable",
        "committee:passed:unfavorable", "committee:passed:failed"
    }
    # vote ids attached to some action so far / attached to more than one
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr, data['session'], action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        # (only the first qualifying action sets each date)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
              and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote matches when it is from the acting chamber and
                # within 20 hours of the action
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20)
                        and vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        elasticsearch_push(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        elasticsearch_push(bill)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def search(query=None, abbr=None, chamber=None, subjects=None,
           bill_id=None, bill_id__in=None, search_window=None,
           updated_since=None, sponsor_id=None, bill_fields=None,
           status=None, type_=None, session=None):
    """Search bills, returning the result of ``db.bills.find``.

    A MongoDB filter is built from the keyword arguments.  When ``query`` is
    given and ``settings.ENABLE_ELASTICSEARCH`` is truthy:

    * a query containing ``<a href`` gets a lookup that filters on
      ``{settings.LEVEL_FIELD: None}``;
    * a query containing a digit is first tried as a bill id (as a
      ``$regex`` when it consists of digits only), and that result is
      returned when it is non-empty;
    * otherwise a full-text search is run and the filter is restricted to
      the ``versions.doc_id`` values it yields.

    When elasticsearch is disabled, ``query`` is matched case-insensitively
    against ``title``.

    search_window - ``'session'``, ``'term'``, ``'session:<name>'``,
        ``'term:<name>'`` or ``'all'``; anything else raises ``ValueError``
    updated_since - passed to ``parse_param_dt``; ``ValueError`` when that
        fails
    status - dict merged into the filter, e.g.
        ``{'action_dates.signed': {'$ne': None}}``
    subjects - stored in the filter as given; the full-text path reads its
        ``'$all'`` entry
    bill_fields - forwarded as ``fields`` to ``db.bills.find``
    bill_id__in - accepted but not used by this function
    """
    _filter = {}
    for key, value in [(settings.LEVEL_FIELD, abbr),
                       ('chamber', chamber),
                       ('subjects', subjects),
                       ('bill_id', bill_id),
                       ]:
        if value is not None:
            _filter[key] = value

    if search_window:
        if search_window == 'session':
            _filter['_current_session'] = True
        elif search_window == 'term':
            _filter['_current_term'] = True
        elif search_window.startswith('session:'):
            _filter['session'] = search_window.split('session:')[1]
        elif search_window.startswith('term:'):
            _filter['_term'] = search_window.split('term:')[1]
        elif search_window == 'all':
            pass
        else:
            raise ValueError('invalid search_window. valid choices are '
                             ' "term", "session", "all"')

    if updated_since:
        try:
            _filter['updated_at'] = {'$gte': parse_param_dt(updated_since)}
        except ValueError:
            raise ValueError('invalid updated_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    if sponsor_id:
        _filter['sponsors.leg_id'] = sponsor_id

    if status:
        # Status is slightly different: it's a dict like--
        # {'action_dates.signed': {'$ne': None}}
        _filter.update(**status)

    if type_:
        _filter['type'] = type_

    if session:
        _filter['session'] = session

    # process full-text query
    if query and settings.ENABLE_ELASTICSEARCH:
        # block spammers, possibly move to a BANNED_SEARCH_LIST setting
        if '<a href' in query:
            return db.bills.find({settings.LEVEL_FIELD: None})

        # if query is numeric convert to an id filter
        if re.search(r'\d+', query):
            _id_filter = dict(_filter)

            # if query is entirely numeric make it a regex
            if not re.search(r'\D', query):
                _id_filter['bill_id'] = {'$regex': fix_bill_id(query).upper()}
            else:
                _id_filter['bill_id'] = fix_bill_id(query).upper()

            # check for a result
            result = db.bills.find(_id_filter, fields=bill_fields)
            if result.count():
                return result

        es_query = {"query_string": {"fields": ["text", "title"],
                                     "default_operator": "AND",
                                     "query": query}}
        es_search = pyes.Search(es_query, fields=[])

        # take terms from mongo query
        es_terms = []
        if settings.LEVEL_FIELD in _filter:
            es_terms.append(pyes.TermFilter(
                settings.LEVEL_FIELD, _filter.pop(settings.LEVEL_FIELD)))
        if 'session' in _filter:
            es_terms.append(pyes.TermFilter('session',
                                            _filter.pop('session')))
        if 'chamber' in _filter:
            es_terms.append(pyes.TermFilter('chamber',
                                            _filter.pop('chamber')))
        if 'subjects' in _filter:
            es_terms.append(pyes.TermFilter(
                'subjects', _filter.pop('subjects')['$all']))
        if 'sponsors.leg_id' in _filter:
            es_terms.append(pyes.TermFilter(
                'sponsors', _filter.pop('sponsors.leg_id')))

        # add terms
        if es_terms:
            es_search.filter = pyes.ANDFilter(es_terms)

        # page size is a guess, could use tweaks
        es_result = elasticsearch.search(es_search, search_type='scan',
                                         scroll='3m', size=250)
        doc_ids = [r.get_id() for r in es_result]
        _filter['versions.doc_id'] = {'$in': doc_ids}
    elif query:
        _filter['title'] = {'$regex': query, '$options': 'i'}

    # return query
    return db.bills.find(_filter, fields=bill_fields)
def search(query=None, abbr=None, chamber=None, subjects=None, bill_id=None,
           search_window=None, updated_since=None, last_action_since=None,
           sponsor_id=None, status=None, type_=None, session=None,
           bill_fields=None, sort=None, limit=None):
    """Search bills and return a ``BillSearchResults``.

    The search runs against elasticsearch when ``query`` is given,
    ``settings.ENABLE_ELASTICSEARCH`` is truthy and the query contains no
    digit; otherwise it runs against MongoDB.  A query containing a digit is
    turned into a ``bill_id`` filter (a ``$regex`` when the query consists
    of digits only).  A non-numeric query on the MongoDB path is matched
    case-insensitively against ``title``.

    bill_id - a single id, or a list of ids (turned into ``$in`` on the
        MongoDB path)
    search_window - ``'session'``, ``'term'``, ``'session:<name>'``,
        ``'term:<name>'`` or ``'all'``; anything else raises ``ValueError``
    updated_since / last_action_since - lower bounds; on the MongoDB path
        they go through ``parse_param_dt`` and raise ``ValueError`` when
        that fails
    status - list of ``action_dates`` keys that must be non-null
    sort - one of ``first``, ``last``, ``signed``, ``passed_lower``,
        ``passed_upper`` (mapped to ``action_dates.<sort>``), ``updated_at``
        or ``created_at``; anything else sorts on ``action_dates.last``
    limit - accepted but not used by this function

    Raises ``PermissionDenied`` when ``query`` contains ``<a href``.
    """
    use_elasticsearch = False
    numeric_query = False
    mongo_filter = {}
    es_terms = []

    if status is None:
        status = []

    if query:
        use_elasticsearch = settings.ENABLE_ELASTICSEARCH

        # spammers are rejected with PermissionDenied
        if '<a href' in query:
            raise PermissionDenied('html detected')

        # if query is numeric convert to an id filter
        # (TODO: maybe this should be an $or)
        if re.search(r'\d+', query):
            # if query is entirely numeric make it a regex and hit mongo
            if not re.search(r'\D', query):
                mongo_filter['bill_id'] = {
                    '$regex': fix_bill_id(query).upper()
                }
            else:
                mongo_filter['bill_id'] = fix_bill_id(query).upper()
            use_elasticsearch = False
            numeric_query = True

    # handle abbr
    if abbr and use_elasticsearch:
        es_terms.append({'term': {'jurisdiction': abbr}})
    elif abbr:
        mongo_filter[settings.LEVEL_FIELD] = abbr

    # sponsor_id
    if sponsor_id and use_elasticsearch:
        es_terms.append({'term': {'sponsor_ids': sponsor_id}})
    elif sponsor_id:
        mongo_filter['sponsors.leg_id'] = sponsor_id

    # handle simple term arguments (chamber, bill_id, type, session)
    if isinstance(bill_id, list) and not use_elasticsearch:
        bill_id = {'$in': bill_id}
    simple_args = {
        'chamber': chamber,
        'bill_id': bill_id,
        'type': type_,
        'session': session
    }

    if search_window:
        if search_window == 'session':
            simple_args['_current_session'] = True
        elif search_window == 'term':
            simple_args['_current_term'] = True
        elif search_window.startswith('session:'):
            simple_args['session'] = search_window.split('session:')[1]
        elif search_window.startswith('term:'):
            simple_args['_term'] = search_window.split('term:')[1]
        elif search_window != 'all':
            raise ValueError('invalid search_window. valid choices are '
                             ' "term", "session", "all"')

    # items() behaves the same on Python 2 and 3 for this small dict.
    for key, value in simple_args.items():
        if value is not None:
            if use_elasticsearch:
                es_terms.append({'term': {key: value}})
            else:
                mongo_filter[key] = value

    if subjects and use_elasticsearch:
        for subject in subjects:
            es_terms.append({'term': {'subjects': subject}})
    elif subjects:
        # Drop empty subjects; an explicit list keeps the value a sequence
        # on every Python version.
        mongo_filter['subjects'] = {'$all': [s for s in subjects if s]}

    if updated_since and use_elasticsearch:
        es_terms.append({'range': {'updated_at': {'gte': updated_since}}})
    elif updated_since:
        try:
            mongo_filter['updated_at'] = {
                '$gte': parse_param_dt(updated_since)
            }
        except ValueError:
            raise ValueError('invalid updated_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    if last_action_since and use_elasticsearch:
        es_terms.append(
            {'range': {
                'action_dates.last': {
                    'gte': last_action_since
                }
            }})
    elif last_action_since:
        try:
            mongo_filter['action_dates.last'] = {
                '$gte': parse_param_dt(last_action_since)
            }
        except ValueError:
            raise ValueError('invalid last_action_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    # Status comes in as a list and needs to become:
    # {'action_dates.signed': {'$ne': None}}
    status_spec = []
    for _status in status:
        status_spec.append({'action_dates.%s' % _status: {'$ne': None}})

    if len(status_spec) == 1:
        status_spec = status_spec[0]
    elif len(status_spec) > 1:
        status_spec = {'$and': status_spec}

    if status_spec and use_elasticsearch:
        for key in status:
            es_terms.append({'exists': {'field': key}})
    elif status_spec:
        mongo_filter.update(**status_spec)

    # preprocess sort
    if sort in ('first', 'last', 'signed', 'passed_lower', 'passed_upper'):
        sort = 'action_dates.' + sort
    elif sort not in ('updated_at', 'created_at'):
        sort = 'action_dates.last'

    # do the actual ES query
    if query and use_elasticsearch:
        es_search = {
            'query': {
                "query_string": {
                    "fields": ["text", "title"],
                    "default_operator": "AND",
                    "query": query
                }
            }
        }
        if es_terms:
            # wrap the text query so the term filters apply to it
            es_search['filter'] = {'and': es_terms}
            es_search = {'query': {'filtered': es_search}}
        es_search['fields'] = []

        return BillSearchResults(es_search, None, sort, bill_fields)

    elif query and not numeric_query:
        mongo_filter['title'] = {'$regex': query, '$options': 'i'}

    return BillSearchResults(None, mongo_filter, sort, bill_fields)
def import_bill(data, standalone_votes, categorizer):
    """
    Insert a bill, or update the stored copy if one already exists.

    data - raw bill JSON; it is modified in place while being prepared
    standalone_votes - votes scraped separately, keyed by
                       (chamber, session, bill_id); the entry for this
                       bill is popped out of it
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" when no prior copy of the bill was found in db.bills
    and "update" otherwise.
    """
    abbr = data[settings.LEVEL_FIELD]

    # normalize the primary and any alternate bill ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(alt_id) for alt_id in data['alternate_bill_ids']
        ]

    # Scraped subjects are stored under "scraped_subjects".
    # NOTE: a blank list is intentionally not copied; this avoids the
    # problem where a bill is re-run but subjects can no longer be
    # fetched (quite common).
    scraped_subjects = data.pop('subjects', None)
    if scraped_subjects:
        data['scraped_subjects'] = scraped_subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: attach the internal id of each one found in the database
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])

        # the companion itself (plus the jurisdiction) is the query; a
        # falsy chamber is left out so it does not constrain the lookup
        companion_spec = companion.copy()
        companion_spec[settings.LEVEL_FIELD] = abbr
        if not companion_spec['chamber']:
            del companion_spec['chamber']

        companion_obj = db.bills.find_one(companion_spec)
        if not companion_obj:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))
            continue
        companion['internal_id'] = companion_obj['_id']

    # look for a prior version of this bill
    existing = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id'],
    })

    # keep doc ids consistent with the ones already stored
    doc_matcher = DocumentMatcher(abbr)
    if existing:
        doc_matcher.learn_ids(existing['versions'] + existing['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # ---- votes ----

    # pull votes off the bill itself
    bill_votes = data.pop('votes', [])

    # Pick the bill id under which separately scraped votes were stored.
    if metadata(abbr).get('_partial_vote_bill_id'):
        # This is a hack initially added for Rhode Island, where the full
        # bill_id can't be determined: when this key is in the metadata
        # only the numeric portion is used.  Not ideal, as it won't work
        # where HB/SBs overlap, but in RI they never do.
        vote_bill_id = data['bill_id'].split()[1]
    else:
        vote_bill_id = data['bill_id']
    bill_votes += standalone_votes.pop(
        (data['chamber'], data['session'], vote_bill_id), [])

    # do id matching and other vote prep
    prepare_votes(abbr, data['session'],
                  existing['_id'] if existing else None,
                  bill_votes)

    # ---- actions ----

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None,
    }

    # action types for which vote matching is attempted
    # NOTE(review): "committee:passed:failed" may be a typo for
    # "committee:failed" -- confirm against the valid action types.
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }

    def _committee_id(action, name):
        return get_committee_id(abbr, action['actor'], name)

    def _legislator_id(action, name):
        return get_legislator_id(abbr, data['session'], action['actor'],
                                 name)

    entity_resolvers = {
        "committee": _committee_id,
        "legislator": _legislator_id
    }

    # a vote and an action pair up when they are this close in time
    match_window = datetime.timedelta(hours=20)

    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolve = entity_resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue
                entity['id'] = resolve(action, entity['name'])

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates: only the first occurrence of each is kept
        if (not dates['passed_upper']
                and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower']
                and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif not dates['signed'] and 'governor:signed' in action['type']:
            dates['signed'] = adate

        # vote-action matching: only attempted if the action has a date
        # and is one of the designated vote action types
        action_attached = False
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if not (delta < match_window and
                        vote['chamber'] == action['actor']):
                    continue

                if action_attached:
                    # multiple votes match, we can't guess
                    action.pop('related_votes', None)
                    continue

                related_vote = vote['vote_id']
                if related_vote in already_linked:
                    # seen on an earlier action too; unlink it afterwards
                    remove_vote.add(related_vote)
                already_linked.add(related_vote)
                action['related_votes'] = [related_vote]
                action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        linked = action.get('related_votes', [])
        for vote_id in remove_vote:
            if vote_id in linked:
                action['related_votes'].remove(vote_id)

    # save action dates to data
    data['action_dates'] = dates
    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # merge any version titles into the alternate titles
        for title_key in ('title', '+short_title'):
            if title_key in version:
                alt_titles.add(version[title_key])

    # make sure the primary title isn't included in the alternate titles
    if 'title' in data:
        alt_titles.discard(data['title'])

    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if existing:
        update(existing, data, db.bills)
        git_add_bill(existing)
        save_votes(existing, bill_votes)
        return "update"

    insert_with_id(data)
    git_add_bill(data)
    save_votes(data, bill_votes)
    return "insert"
def search(query=None, abbr=None, chamber=None, subjects=None,
           bill_id=None, bill_id__in=None, search_window=None,
           updated_since=None, sponsor_id=None, bill_fields=None,
           status=None, type_=None, session=None):
    """
    Search bills with a Mongo filter plus optional full-text search.

    query - search text; when Elasticsearch is enabled and the text
            contains a digit, a bill_id lookup is tried first
    abbr, chamber, subjects, bill_id - copied into the Mongo filter as-is
            when not None
    bill_id__in - accepted but not used by this function
    search_window - "session", "term", "session:<name>", "term:<name>"
            or "all"
    updated_since - date string handed to parse_param_dt
    sponsor_id - matched against sponsors.leg_id
    bill_fields - passed to db.bills.find as the fields argument
    status - dict of Mongo clauses merged into the filter, e.g.
            {'action_dates.signed': {'$ne': None}}
    type_, session - exact-match filters on "type" and "session"

    Returns the result of db.bills.find().
    Raises ValueError for an unrecognized search_window or an
    updated_since value that parse_param_dt rejects.
    """
    _filter = {}
    simple_filters = [
        (settings.LEVEL_FIELD, abbr),
        ('chamber', chamber),
        ('subjects', subjects),
        ('bill_id', bill_id),
    ]
    for key, value in simple_filters:
        if value is not None:
            _filter[key] = value

    if search_window:
        if search_window == 'session':
            _filter['_current_session'] = True
        elif search_window == 'term':
            _filter['_current_term'] = True
        elif search_window.startswith('session:'):
            _filter['session'] = search_window.split('session:')[1]
        elif search_window.startswith('term:'):
            _filter['_term'] = search_window.split('term:')[1]
        elif search_window == 'all':
            pass
        else:
            raise ValueError('invalid search_window. valid choices are '
                             ' "term", "session", "all"')

    if updated_since:
        try:
            _filter['updated_at'] = {'$gte': parse_param_dt(updated_since)}
        except ValueError:
            raise ValueError('invalid updated_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    if sponsor_id:
        _filter['sponsors.leg_id'] = sponsor_id

    if status:
        # Status is slightly different: it's a dict like--
        # {'action_dates.signed': {'$ne': None}}
        _filter.update(status)

    if type_:
        _filter['type'] = type_

    if session:
        _filter['session'] = session

    # process full-text query
    if query and settings.ENABLE_ELASTICSEARCH:
        # block spammers, possibly move to a BANNED_SEARCH_LIST setting
        if '<a href' in query:
            return db.bills.find({settings.LEVEL_FIELD: None})

        if re.search(r'\d', query):
            _id_filter = dict(_filter)
            _id_filter['bill_id'] = fix_bill_id(query).upper()
            # find() hands back a cursor object, which is truthy even when
            # nothing matches (assuming a pymongo collection), so testing
            # it directly would always short-circuit the full-text search.
            # Probe with find_one() to learn whether any bill matches.
            # NOTE(review): this early return ignores bill_fields, as the
            # original did -- confirm whether that is intended.
            if db.bills.find_one(_id_filter) is not None:
                return db.bills.find(_id_filter)

        query = {
            "query_string": {
                "fields": ["text", "title"],
                "default_operator": "AND",
                "query": query
            }
        }
        search = pyes.Search(query, fields=[])

        # take terms from mongo query
        es_terms = []
        for field in (settings.LEVEL_FIELD, 'session', 'chamber'):
            if field in _filter:
                es_terms.append(pyes.TermFilter(field, _filter.pop(field)))
        if 'subjects' in _filter:
            subject_terms = _filter.pop('subjects')
            # a Mongo-style {'$all': [...]} spec is unwrapped; any other
            # value is passed through unchanged instead of raising
            if isinstance(subject_terms, dict):
                subject_terms = subject_terms['$all']
            es_terms.append(pyes.TermFilter('subjects', subject_terms))
        if 'sponsors.leg_id' in _filter:
            es_terms.append(
                pyes.TermFilter('sponsors', _filter.pop('sponsors.leg_id')))

        # add terms
        if es_terms:
            search.filter = pyes.ANDFilter(es_terms)

        # page size is a guess, could use tweaks
        es_result = elasticsearch.search(search, search_type='scan',
                                         scroll='3m', size=250)
        doc_ids = [r.get_id() for r in es_result]
        _filter['versions.doc_id'] = {'$in': doc_ids}
    elif query:
        # no full-text backend: case-insensitive regex on the title
        _filter['title'] = {'$regex': query, '$options': 'i'}

    # return query
    return db.bills.find(_filter, bill_fields)
def import_events(abbr, data_dir, import_actions=False):
    """
    Import every scraped event JSON file for one jurisdiction.

    abbr - abbreviation whose events are imported
    data_dir - base data directory; files are read from
               <data_dir>/<abbr>/events/*.json
    import_actions - accepted but not used by this function

    For each file the participant ids and related-bill ids are filled in
    and the event is handed to import_event(); ensure_indexes() is called
    once after all files have been processed.
    """
    def _committee_id(event, participant):
        return get_committee_id(event[settings.LEVEL_FIELD],
                                participant['chamber'],
                                participant['participant'])

    def _legislator_id(event, participant):
        # any chamber other than upper/lower is passed on as None
        chamber = participant['chamber']
        if chamber not in ['upper', 'lower']:
            chamber = None
        return get_legislator_id(abbr, event['session'], chamber,
                                 participant['participant'])

    resolvers = {
        "committee": _committee_id,
        "legislator": _legislator_id
    }

    events_glob = os.path.join(data_dir, abbr, 'events', '*.json')

    for path in glob.iglob(events_glob):
        with open(path) as handle:
            event = prepare_obj(json.load(handle))

        # participants: unknown participant types keep an id of None
        for participant in event['participants']:
            kind = participant['participant_type']
            resolved_id = None
            if kind in resolvers:
                resolved_id = resolvers[kind](event, participant)
            else:
                logger.warning("I don't know how to resolve a %s" % kind)
            participant['id'] = resolved_id

        # related bills: match on either the primary or an alternate id
        for related in event['related_bills']:
            bill_id = fix_bill_id(related['bill_id'])
            by_primary_id = {
                settings.LEVEL_FIELD: abbr,
                'session': event['session'],
                'bill_id': bill_id
            }
            by_alternate_id = {
                settings.LEVEL_FIELD: abbr,
                'session': event['session'],
                'alternate_bill_ids': bill_id
            }
            db_bill = db.bills.find_one(
                {"$or": [by_primary_id, by_alternate_id]})

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            related['id'] = db_bill['_id'] if db_bill else None

        import_event(event)

    ensure_indexes()
def search(query=None, abbr=None, chamber=None, subjects=None,
           bill_id=None, search_window=None, updated_since=None,
           last_action_since=None, sponsor_id=None, status=None,
           type_=None, session=None, bill_fields=None, sort=None,
           limit=None):
    """
    Build a bill search and return it wrapped in BillSearchResults.

    query - search text; text containing a digit is treated as a bill id
            and always goes to Mongo, other text goes to Elasticsearch
            when settings.ENABLE_ELASTICSEARCH is set, otherwise to a
            case-insensitive title regex
    abbr - jurisdiction abbreviation
    chamber, type_, session - exact-match filters
    subjects - list of subjects, all of which must match
    bill_id - a single bill id, or a list of ids (matched with $in on
            the Mongo path)
    search_window - "session", "term", "session:<name>", "term:<name>"
            or "all"
    updated_since, last_action_since - date strings; parsed with
            parse_param_dt on the Mongo path, passed through unchanged
            on the Elasticsearch path
    sponsor_id - sponsor's leg_id
    status - list of action_dates keys that must be set (e.g. "signed")
    bill_fields - passed through to BillSearchResults
    sort - "first", "last", "signed", "passed_lower", "passed_upper",
            "updated_at" or "created_at"; anything else falls back to
            "action_dates.last"
    limit - accepted but not used by this function

    Returns a BillSearchResults built from either the Elasticsearch
    search or the Mongo filter.
    Raises PermissionDenied when the query contains "<a href", and
    ValueError for an unrecognized search_window or (on the Mongo path)
    an unparsable date.
    """
    use_elasticsearch = False
    numeric_query = False
    mongo_filter = {}
    es_terms = []

    if status is None:
        status = []

    if query:
        use_elasticsearch = settings.ENABLE_ELASTICSEARCH

        # spammers get a 400
        if '<a href' in query:
            raise PermissionDenied('html detected')

        # if query is numeric convert to an id filter
        # (TODO: maybe this should be an $or)
        if re.search(r'\d', query):
            normalized_id = fix_bill_id(query).upper()
            # if query is entirely numeric make it a regex and hit mongo
            if not re.search(r'\D', query):
                mongo_filter['bill_id'] = {'$regex': normalized_id}
            else:
                mongo_filter['bill_id'] = normalized_id
            use_elasticsearch = False
            numeric_query = True

    # handle abbr
    if abbr and use_elasticsearch:
        es_terms.append({'term': {'jurisdiction': abbr}})
    elif abbr:
        mongo_filter[settings.LEVEL_FIELD] = abbr

    # sponsor_id
    if sponsor_id and use_elasticsearch:
        es_terms.append({'term': {'sponsor_ids': sponsor_id}})
    elif sponsor_id:
        mongo_filter['sponsors.leg_id'] = sponsor_id

    # handle simple term arguments (chamber, bill_id, type, session)
    if isinstance(bill_id, list) and not use_elasticsearch:
        bill_id = {'$in': bill_id}
    simple_args = {'chamber': chamber, 'bill_id': bill_id, 'type': type_,
                   'session': session}

    if search_window:
        if search_window == 'session':
            simple_args['_current_session'] = True
        elif search_window == 'term':
            simple_args['_current_term'] = True
        elif search_window.startswith('session:'):
            simple_args['session'] = search_window.split('session:')[1]
        elif search_window.startswith('term:'):
            simple_args['_term'] = search_window.split('term:')[1]
        elif search_window != 'all':
            raise ValueError('invalid search_window. valid choices are '
                             ' "term", "session", "all"')

    # .items() rather than .iteritems() so this runs on Python 2 and 3
    for key, value in simple_args.items():
        if value is not None:
            if use_elasticsearch:
                es_terms.append({'term': {key: value}})
            else:
                mongo_filter[key] = value

    if subjects and use_elasticsearch:
        for subject in subjects:
            es_terms.append({'term': {'subjects': subject}})
    elif subjects:
        # drop empty entries; always a real list so $all gets an array
        mongo_filter['subjects'] = {'$all': [s for s in subjects if s]}

    if updated_since and use_elasticsearch:
        es_terms.append({'range': {'updated_at': {'gte': updated_since}}})
    elif updated_since:
        try:
            mongo_filter['updated_at'] = {
                '$gte': parse_param_dt(updated_since)
            }
        except ValueError:
            raise ValueError('invalid updated_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    if last_action_since and use_elasticsearch:
        es_terms.append(
            {'range': {'action_dates.last': {'gte': last_action_since}}})
    elif last_action_since:
        try:
            mongo_filter['action_dates.last'] = {
                '$gte': parse_param_dt(last_action_since)
            }
        except ValueError:
            raise ValueError('invalid last_action_since parameter. '
                             'please supply date in YYYY-MM-DD format')

    # Status comes in as a list and needs to become:
    # {'action_dates.signed': {'$ne': None}}
    status_spec = [{'action_dates.%s' % _status: {'$ne': None}}
                   for _status in status]
    if len(status_spec) == 1:
        status_spec = status_spec[0]
    elif len(status_spec) > 1:
        status_spec = {'$and': status_spec}

    if status_spec and use_elasticsearch:
        # NOTE(review): the bare status name is used as the field here,
        # while the Mongo path and the range filter above use
        # "action_dates.<name>" -- confirm against the indexed documents.
        for key in status:
            es_terms.append({'exists': {'field': key}})
    elif status_spec:
        mongo_filter.update(status_spec)

    # preprocess sort
    if sort in ('first', 'last', 'signed', 'passed_lower', 'passed_upper'):
        sort = 'action_dates.' + sort
    elif sort not in ('updated_at', 'created_at'):
        sort = 'action_dates.last'

    # do the actual ES query
    if query and use_elasticsearch:
        search = {
            'query': {
                "query_string": {
                    "fields": ["text", "title"],
                    "default_operator": "AND",
                    "query": query
                }
            }
        }
        if es_terms:
            # wrap the text query and the term filters in a filtered query
            search['filter'] = {'and': es_terms}
            search = {'query': {'filtered': search}}
        search['fields'] = []
        return BillSearchResults(search, None, sort, bill_fields)

    elif query and not numeric_query:
        mongo_filter['title'] = {'$regex': query, '$options': 'i'}

    return BillSearchResults(None, mongo_filter, sort, bill_fields)