# Assumed imports for this script; the billy-specific paths (db, settings,
# fix_bill_id) are a best guess at the project layout and may differ.
import re
import sys
import logging
from collections import defaultdict, Counter

from billy.core import db, settings
from billy.utils import fix_bill_id


def main():
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)

    tally = defaultdict(Counter)

    for abbr in abbrs:
        abbr_tally = tally[abbr]

        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }
        for event in db.events.find(spec):
            fixed = False
            for bill in event['related_bills']:
                bill_id = bill.get('bill_id')
                if bill_id is not None:

                    # If "bill_id" is a big id, rename it.
                    if re.match(r'[A-Z]{2}B\d{8}', bill_id):
                        _id = bill.pop('bill_id')
                        bill['id'] = _id
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1
                        fixed = True

                    # If it's anything else, run fix_bill_id to repair
                    # screwed-up old ids.
                    else:
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1
                        fixed = True

                if '_scraped_bill_id' in bill:
                    bill_id = fix_bill_id(bill.pop('_scraped_bill_id'))
                    bill['bill_id'] = bill_id
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1
                    fixed = True

            if fixed:
                msg = 'Updating related_bills on event %r.'
                logger.debug(msg % event['_id'])
                db.events.save(event)

        logger.info(abbr)
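# Illustrative only (not part of the original module): how the rename branch
# in main() treats a matching "big" id. The sample dict and id are made up;
# the regex is the one used above.
def _example_rename():
    bill = {'bill_id': 'CAB00004148'}
    if re.match(r'[A-Z]{2}B\d{8}', bill['bill_id']):
        # Move the value from 'bill_id' to 'id'.
        bill['id'] = bill.pop('bill_id')
    return bill  # {'id': 'CAB00004148'}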
def main():
    import sys
    abbr = sys.argv[1]
    logger = logging.getLogger('purge_committee_ids')

    spec = {settings.LEVEL_FIELD: abbr}
    committee_ids = [c['_id'] for c in db.committees.find(spec, fields=['_id'])]

    # Events with committee participants.
    spec = {
        settings.LEVEL_FIELD: abbr,
        'participants.committee_id': {'$nin': committee_ids}
    }
    for event in db.events.find(spec):
        found = False
        for participant in event['participants']:
            for id_key in 'committee_id', 'id':
                _id = participant.get(id_key, None)
                type_ = participant.get('participant_type')
                if id_key == 'id' and type_ != 'committee':
                    continue
                if _id and (_id not in committee_ids):
                    found = True
                    msg = 'Removing participant %r from event %r'
                    logger.info(msg % (participant['committee_id'], event['_id']))
                    event['participants'].remove(participant)
        if found:
            pass
            # import ipdb;ipdb.set_trace()

    # Bill actions.
    spec = {
        settings.LEVEL_FIELD: abbr,
        'actions.related_entities.type': 'committee'
    }
    for bill in db.bills.find(spec):
        # pprint.pprint(bill['actions'])
        found = False
        for action in bill['actions']:
            for entity in action['related_entities']:
                if entity['type'] == 'committee':
                    if entity['id'] not in committee_ids:
                        found = True
                        msg = 'Removing entity %r from action in %r'
                        logger.info(msg % (entity['id'], bill['bill_id']))
                        action['related_entities'].remove(entity)
        if found:
            pass
def main():
    import sys
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('purge_committee_ids')
    logger.setLevel(logging.DEBUG)

    for abbr in abbrs:
        spec = {settings.LEVEL_FIELD: abbr}
        committee_ids = [
            c['_id'] for c in db.committees.find(spec, fields=['_id'])
        ]

        # Events with committee participants.
        spec = {
            settings.LEVEL_FIELD: abbr,
            'participants.committee_id': {'$nin': committee_ids}
        }
        for event in db.events.find(spec):
            old_ids = set()
            count = 0
            found = False
            for participant in event['participants']:
                for id_key in 'committee_id', 'id':
                    _id = participant.get(id_key, None)
                    type_ = participant.get('participant_type')
                    if id_key == 'id' and type_ != 'committee':
                        continue
                    if _id and (_id not in committee_ids):
                        found = True
                        count += 1
                        old_ids.add(_id)
                        msg = 'Removing participant %r from event %r'
                        logger.info(msg % (participant[id_key], event['_id']))
                        # Leave the participant in but set their id to None.
                        # Text will still be displayed without a hyperlink.
                        participant[id_key] = None
            if found:
                msg = 'Removed %d old committee %r ids from %r'
                logger.info(msg % (count, old_ids, event['_id']))
                db.events.save(event, safe=True)

        # Related committees in bill actions.
        spec = {
            settings.LEVEL_FIELD: abbr,
            'actions.related_entities.type': 'committee'
        }
        for bill in db.bills.find(spec):
            old_ids = set()
            count = 0
            found = False
            for action in bill['actions']:
                # Iterate over a copy so entities can be removed safely.
                for entity in list(action['related_entities']):
                    if entity['type'] == 'committee':
                        if entity['id'] and (entity['id'] not in committee_ids):
                            found = True
                            count += 1
                            old_ids.add(entity['id'])
                            msg = 'Removing entity %r from action in %r'
                            logger.debug(msg % (entity['id'], bill['bill_id']))
                            # Completely remove the related entity. Without an
                            # id it has no other purpose.
                            action['related_entities'].remove(entity)
            if found:
                msg = 'Removed %d old committee %r ids from %r'
                logger.info(msg % (count, old_ids, bill['_id']))
                db.bills.save(bill, safe=True)

        # Legislator old roles.
        spec = {settings.LEVEL_FIELD: abbr, 'old_roles': {'$exists': True}}
        for leg in db.legislators.find(spec):
            old_ids = set()
            count = 0
            found = False
            for role in leg['old_roles']:
                if 'committee_id' in role:
                    _id = role['committee_id']
                    if _id and (_id not in committee_ids):
                        found = True
                        count += 1
                        old_ids.add(_id)
                        msg = 'Removing id %r from old_role in %r'
                        logger.info(msg % (role['committee_id'], leg['full_name']))
                        # Set the id to None.
                        role['committee_id'] = None
            if found:
                msg = 'Removed %d old committee %r ids from %r'
                logger.info(msg % (count, old_ids, leg['_id']))
                db.legislators.save(leg, safe=True)

        # Related entities in feeds.
        spec = {settings.LEVEL_FIELD: abbr, 'entity_ids': {'$ne': None}}
        for entry in feeds_db.entries.find(spec):
            old_ids = set()
            count = 0
            found = False
            # Iterate over a copy because matching ids get deleted below.
            for entity_id in list(entry['entity_ids']):
                if entity_id[2] == 'C':
                    if entity_id not in committee_ids:
                        found = True
                        count += 1
                        msg = 'Removing id %r from feed %r'
                        logger.info(msg % (entity_id, entry['_id']))
                        # Delete the entity from the feed.
                        old_ids.add(entity_id)
                        index = entry['entity_ids'].index(entity_id)
                        del entry['entity_ids'][index]
                        del entry['entity_strings'][index]
            if found:
                msg = 'Removed %d old committee ids %r from %r'
                logger.info(msg % (count, old_ids, entry['_id']))
                feeds_db.entries.save(entry, safe=True)

        # Nuke any committee sponsors of bills.
        spec = {
            settings.LEVEL_FIELD: abbr,
            'sponsors.committee_id': {'$nin': committee_ids}
        }
        for bill in db.bills.find(spec):
            count = 0
            found = False
            old_ids = set()
            for sponsor in bill.get('sponsors', []):
                if 'committee_id' in sponsor:
                    _id = sponsor['committee_id']
                    old_ids.add(_id)
                    found = True
                    count += 1
                    del sponsor['committee_id']
            if found:
                msg = 'Removed %d old committee ids %r from %r'
                logger.info(msg % (count, old_ids, bill['_id']))
                db.bills.save(bill)
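# Hypothetical entry point, added for illustration; the original script may
# invoke main() differently (for example via a console_scripts hook).
if __name__ == '__main__':
    main()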
import os
import sys
from os.path import dirname, abspath, join
import shutil

from billy.core import logging

from models import Feed
from entities import Extractor


if __name__ == '__main__':

    level = logging.DEBUG
    logging.getLogger('billy.feed-model').setLevel(level)
    logging.getLogger('billy.entry-model').setLevel(level)
    logging.getLogger('billy.extractor').setLevel(level)

    # The path where the news/blogs code and urls files are located.
    PATH = dirname(abspath(__file__))

    filenames = os.listdir(join(PATH, 'urls'))
    filenames = filter(lambda s: '~' not in s, filenames)

    for urls_filename in filenames:
        abbr = urls_filename.lower().replace('.txt', '')

        # If abbrs are specified on the command line, scrape only those.
        if sys.argv[1:] and (abbr not in sys.argv[1:]):
            continue
class Feed(object):
    '''This model handles fetching the rss feed and recording any errors
    that occur for post-mortem reporting. It also has an instance-level
    report dictionary that gets augmented each time one of the feed's
    entries is scanned for relevant entities.
    '''
    request_defaults = dict(requests_per_minute=0, cache_write_only=False)
    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.feed-model')

    def __init__(self, url):
        self.url = url
        self.succeeded = None
        self.default_report = {
            'entries': {
                'count': 0,
                'new': 0,
                'old': 0,
                'relevant': 0,
            },
            'entities': {
                'count': 0,
            }
        }
        self.report = {
            'url': url,
            # The info is stored under the jurisdiction key to avoid
            # over-writing data for feeds with national scope that are
            # scanned for multiple jurisdictions. For example:
            'ex': self.default_report
        }
        # Delete the example data.
        del self.report['ex']
        self._initial_save()

    def _initial_save(self):
        '''Perform the initial save (to get us the mongo_id) if none
        exists yet.
        '''
        spec = dict(url=self.url)
        update = {'$set': spec}
        self.logger.info('feed._initial_save %r' % self.url)
        doc = feeds_db.feeds.find_and_modify(spec, update, upsert=True)
        self.mongo_id = doc['_id']

    def _get_feed(self):
        '''Try to fetch the feed and parse it. If the fetch fails, log the
        exception. Finally, update the report with details of the
        success/failure of the fetch.
        '''
        self.logger.info('feed GET %r' % self.url)
        try:
            text = self.session.get(self.url).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            return
        self.succeeded = True
        # XXX: This will fail if the link doesn't point to a valid feed.
        data = feedparser.parse(text)
        self._data = data
        self._update_report_after_fetch()
        return data

    @property
    def data(self):
        '''The parsed feed contents.
        '''
        data = getattr(self, '_data', None)
        return data or self._get_feed()

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch fails, log the exception and store the traceback
        for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''Update the feed's report with whether the fetch operation
        succeeded or failed, including a formatted traceback if it failed.
        '''
        last_fetch = {
            'succeeded': self.succeeded,
            'datetime': datetime.datetime.utcnow()
        }
        if not self.succeeded:
            last_fetch['traceback'] = self.traceback
        report = {'url': self.url, 'last_fetch': last_fetch}
        self.report.update(report)

    def entries(self):
        '''A generator of wrapped entries for this feed.
        '''
        for entry in self.data['entries']:
            yield Entry(entry, feed=self)

    def serializable(self):
        '''Returns metadata about the feed (url, etc.) and report
        information that can be saved in mongo.
        '''
        return {'$set': self.report}

    def finish_report(self):
        ''' '''

    def save(self):
        ''' '''
        spec = dict(url=self.url)
        feeds_db.feeds.find_and_modify(spec, self.serializable(), upsert=True)
        self.logger.info('feed.save: %r' % self.url)
class Entry(object):
    '''Wrap a parsed feed entry dictionary thingy from feedparser.
    '''
    request_defaults = dict(requests_per_minute=0, cache_write_only=False)
    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.entry-model')

    def __init__(self, entry, feed):
        self.entry = entry
        self.feed = feed
        self.report = {}

        # Whether a fetch of the full text was tried and succeeded.
        self.tried = False
        self.succeeded = None

    def mongo_id(self):
        '''Get a unique mongo id based on this entry's url and title.
        '''
        s = self.entry['link'] + self.entry['title']
        return hashlib.md5(s).hexdigest()

    def is_new(self):
        '''Guess whether this entry is new (i.e., previously unseen) or old.
        '''
        mongo_id = self.mongo_id()
        if feeds_db.entries.find_one(mongo_id) is None:
            is_new = True
        else:
            is_new = False
        self.logger.info('is_new? %r --> %r' % (mongo_id, is_new))
        return is_new

    def _get_full_text(self):
        '''Just for experimenting at this point. Fetch the full text, log
        any exception that occurs, and store the details regarding the
        outcome of the fetch on the object.
        '''
        self.logger.info('entry GET %r' % self.entry.link)
        try:
            html = self.session.get(self.entry.link).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            return
        self.succeeded = True
        self.tried = True
        self.html = html
        self._update_report_after_fetch()
        return html

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch failed, log the failure and store the traceback
        object for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''After fetching the entry's full text (if at all), update the
        entry's report with the outcome of the fetch operation, including
        a traceback if it failed.
        '''
        report = {
            'url': self.entry.link,
            'entity_count': len(self.entry['entity_ids'])
        }
        if self.tried:
            last_fetch = {
                'succeeded': self.succeeded,
                'datetime': datetime.datetime.utcnow()
            }
            if not self.succeeded:
                last_fetch['traceback'] = self.traceback
            report.update(last_fetch=last_fetch)
        self.report.update(report)

    def serializable(self):
        '''Replace date objects with datetime objects that can be json
        serialized.
        '''
        # Add the feed's id to make the entry and its feed joinable.
        ret = dict(feed_id=self.feed.mongo_id)

        # Convert unserializable timestructs into datetimes.
        for k, v in self.entry.items():
            if isinstance(v, time.struct_time):
                t = time.mktime(self.entry[k])
                v = datetime.datetime.fromtimestamp(t)
            ret[k] = v
        return ret

    def save_if_entities_found(self):
        '''If the entry is previously unseen and the extractor found
        mentioned entities, save; otherwise do nothing.
        '''
        if self.is_new() and self.entry['entity_ids']:
            feeds_db.entries.save(self.serializable())
            self.logger.info(
                'entry.save_if_entities_found: %r' % self.entry.link)

    def finish_report(self, abbr):
        '''After attempting to extract entities, update the report and the
        report of this entry's feed with relevant information.

        Two things happen in this function: the entry's report gets
        updated, and the report object on the entry's feed gets updated.

        The feed's default report for a jurisdiction has this basic shape:

            {
                'entries': {
                    'count': 0,
                    'new': 0,
                    'old': 0,
                    'relevant': 0,
                },
                'entities': {
                    'count': 0,
                }
            }

        `abbr` is the jurisdiction abbreviation this info will be stored
        under in the feed's report object.
        '''
        # Update the feed's report.
        feed_report = self.feed.report
        report = feed_report.get(abbr, self.feed.default_report)
        report['entries']['count'] += 1

        # If this is a new entry...
        if self.tried:
            report['entries']['new'] += 1
            if self.entry['entity_ids']:
                report['entries']['relevant'] += 1
                report['entities']['count'] += len(self.entry['entity_ids'])
        else:
            report['entries']['old'] += 1
class Feed(object):
    '''This model handles fetching the rss feed and recording any errors
    that occur for post-mortem reporting. It also has an instance-level
    report dictionary that gets augmented each time one of the feed's
    entries is scanned for relevant entities.
    '''
    request_defaults = dict(
        cache_obj=FileCache(FEEDS_CACHE),
        requests_per_minute=0,
        cache_write_only=False)
    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.feed-model')

    def __init__(self, url, jurisdiction):
        self.url = url
        self.jurisdiction = jurisdiction
        self.succeeded = None
        self.default_report = {
            'entries': {
                'count': 0,
                'new': 0,
                'old': 0,
                'relevant': 0,
            },
            'entities': {
                'count': 0,
            }
        }
        self.report = {
            'url': url,
            # The info is stored under the jurisdiction key to avoid
            # over-writing data for feeds with national scope that are
            # scanned for multiple jurisdictions.
            jurisdiction: self.default_report
        }
        # Make sure this feed has a mongo id.
        self._initial_save()

    @staticmethod
    def blast_cache():
        '''Remove the scrapelib.Scraper fastmode cache used for feed
        retrieval. Done once before a scrape, but not between the multiple
        jurisdictions of a single run, since a feed with national scope may
        need to be processed for each state.
        '''
        shutil.rmtree(FEEDS_CACHE)

    def _initial_save(self):
        '''Perform the initial save (to get us the mongo_id) if none
        exists yet.
        '''
        spec = dict(url=self.url)
        update = {'$set': spec}
        self.logger.debug('feed._initial_save %r' % self.url)
        doc = feeds_db.feeds.find_and_modify(
            spec, update, upsert=True, new=True)
        self.mongo_id = doc['_id']

    def _get_feed(self):
        '''Try to fetch the feed and parse it. If the fetch fails, log the
        exception. Finally, update the report with details of the
        success/failure of the fetch.
        '''
        try:
            text = self.session.get(self.url).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            self._update_report_after_fetch()
        else:
            self.succeeded = True
            # XXX: This will fail if the text isn't a valid feed.
            data = feedparser.parse(text)
            self._data = data
            self._update_report_after_fetch()
            return data

    @property
    def data(self):
        '''The parsed feed contents.
        '''
        data = getattr(self, '_data', None)
        return data or self._get_feed() or {}

    def is_valid(self):
        '''Does this hot garbage contain the keys we expect?
        '''
        return 'title' in self.data.get('feed', {})

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch fails, log the exception and store the traceback
        for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''Update the feed's report with whether the fetch operation
        succeeded or failed, including a formatted traceback if it failed.
        '''
        last_fetch = {
            'succeeded': self.succeeded,
            'datetime': datetime.datetime.utcnow()
        }
        if not self.succeeded:
            last_fetch['traceback'] = self.traceback
        self.report[self.jurisdiction].update(last_fetch=last_fetch)

    def entries(self):
        '''A generator of wrapped entries for this feed.
        '''
        data = self.data or {}
        entries = data.get('entries', [])
        for entry in entries:
            yield Entry(entry, feed=self)

    def serializable(self):
        '''Returns metadata about the feed (url, etc.) and report
        information that can be saved in mongo.
        '''
        return {'$set': self.report}

    def finish_report(self):
        '''Extra stuff to go in the report goes here.
        '''

    def save(self):
        '''Update the feed record with the latest report.
        '''
        if not self.is_valid():
            return
        spec = dict(url=self.url)
        update = {'$set': self.report}
        self.logger.debug('feed.save %r' % self.url)
        feeds_db.feeds.find_and_modify(spec, update, upsert=True, new=True)
        self.logger.info('feed.save: %r' % self.url)
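# A rough usage sketch (not from the original codebase) tying Feed and Entry
# together using only the methods defined above. The feed URL and the 'ca'
# jurisdiction are made-up placeholders.
def _example_scan():
    feed = Feed('http://example.com/rss.xml', jurisdiction='ca')
    if not feed.is_valid():
        return
    for entry in feed.entries():
        # Entity extraction (the Extractor from entities.py) would normally
        # populate entry.entry['entity_ids'] before these calls.
        entry.save_if_entities_found()
        entry.finish_report('ca')
    feed.finish_report()
    feed.save()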