def main():
    '''Normalize the bill references stored in events' ``related_bills``.

    Jurisdiction abbreviations are taken from the command line; when none
    are given, every abbreviation found in ``db.metadata`` is processed.
    For each related bill of each matching event:

    * a ``bill_id`` that starts with a big id (two capital letters, ``B``,
      eight digits) is moved to the ``id`` key;
    * any other ``bill_id`` is passed through ``fix_bill_id``;
    * a ``_scraped_bill_id`` is passed through ``fix_bill_id`` and stored
      under ``bill_id``.

    A per-abbreviation tally of the changes is logged once all
    abbreviations have been processed.
    '''
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)

    # Compiled once; the same pattern is applied to every related bill.
    big_id = re.compile(r'[A-Z]{2}B\d{8}')
    tally = defaultdict(Counter)

    for abbr in abbrs:
        # Key the tally by the abbreviation itself (not the literal string
        # 'abbr') so that counts for different jurisdictions stay separate.
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }
        for event in db.events.find(spec):
            fixed = False
            for bill in event['related_bills']:
                bill_id = bill.get('bill_id')
                if bill_id is not None:
                    if big_id.match(bill_id):
                        # "bill_id" holds a big id: rename the key.
                        bill['id'] = bill.pop('bill_id')
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1
                    else:
                        # Anything else goes through fix_bill_id to
                        # repair badly formatted old ids.
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1
                    fixed = True

                if '_scraped_bill_id' in bill:
                    bill['bill_id'] = fix_bill_id(bill.pop('_scraped_bill_id'))
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1
                    fixed = True

            if fixed:
                msg = 'Updating related_bills on event %r.'
                logger.debug(msg % event['_id'])
                # NOTE(review): the write-back below is assumed from the
                # log message above -- confirm the intended save options.
                db.events.save(event)

    # Report what was changed; without this the tally is never surfaced.
    for abbr, counts in tally.items():
        logger.info('%s: %r' % (abbr, dict(counts)))
def main():
    '''Clean up the ``related_bills`` entries stored on events.

    The abbreviations to process come from ``sys.argv[1:]``, falling back
    to every abbreviation in ``db.metadata``. Each related bill of each
    matching event is normalized as follows:

    * ``bill_id`` values that start with a big id (two capital letters,
      ``B``, eight digits) are moved to ``id``;
    * all other ``bill_id`` values are rewritten with ``fix_bill_id``;
    * ``_scraped_bill_id`` values are rewritten with ``fix_bill_id`` and
      stored as ``bill_id``.

    The number of changes of each kind is tallied per abbreviation and
    logged at the end of the run.
    '''
    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('billy.purge_committee_ids')
    logger.setLevel(logging.INFO)

    # Hoisted out of the loops: the pattern never changes.
    big_id = re.compile(r'[A-Z]{2}B\d{8}')
    tally = defaultdict(Counter)

    for abbr in abbrs:
        # Use the abbreviation as the key; the literal 'abbr' would lump
        # every jurisdiction into a single counter.
        abbr_tally = tally[abbr]
        spec = {
            settings.LEVEL_FIELD: abbr,
            'related_bills': {'$exists': True, '$ne': []},
        }
        for event in db.events.find(spec):
            fixed = False
            for bill in event['related_bills']:
                bill_id = bill.get('bill_id')
                if bill_id is not None:
                    if big_id.match(bill_id):
                        # A big id stored under "bill_id": rename the key.
                        bill['id'] = bill.pop('bill_id')
                        logger.info('Renamed "bill_id" to "id"')
                        abbr_tally['bill_id --> id'] += 1
                    else:
                        # Otherwise run fix_bill_id over the old value.
                        bill['bill_id'] = fix_bill_id(bill['bill_id'])
                        logger.info('Fixed an un-fixed bill_id')
                        abbr_tally['fix_bill_id'] += 1
                    fixed = True

                if '_scraped_bill_id' in bill:
                    bill['bill_id'] = fix_bill_id(bill.pop('_scraped_bill_id'))
                    logger.info('Renamed "_scraped_bill_id" to "bill_id"')
                    abbr_tally['_scraped_bill_id --> bill_id'] += 1
                    fixed = True

            if fixed:
                msg = 'Updating related_bills on event %r.'
                logger.debug(msg % event['_id'])
                # NOTE(review): the write-back below is assumed from the
                # log message above -- confirm the intended save options.
                db.events.save(event)

    # Surface the tally, which is otherwise collected but never used.
    for abbr, counts in tally.items():
        logger.info('%s: %r' % (abbr, dict(counts)))
def main():
    '''Drop references to committees that are no longer in the database.

    The jurisdiction abbreviation is read from ``sys.argv[1]``. Two kinds
    of documents are scanned:

    * events: participants whose ``committee_id`` (or ``id``, for
      participants of type ``committee``) is not a known committee id are
      removed from the event's ``participants`` list;
    * bills: ``committee`` related entities of actions whose ``id`` is not
      a known committee id are removed from ``related_entities``.

    NOTE(review): the documents are only changed in memory; no save call
    is visible in this version, so it behaves as a dry run -- confirm.
    '''
    import sys

    abbr = sys.argv[1]
    logger = logging.getLogger('purge_committee_ids')

    spec = {settings.LEVEL_FIELD: abbr}
    committee_ids = [
        c['_id'] for c in db.committees.find(spec, fields=['_id'])]
    # The list is what gets sent to mongo; the set is for fast lookups.
    known_ids = frozenset(committee_ids)

    def stale_id(participant):
        '''Return the participant's unknown committee id, or None.'''
        for id_key in ('committee_id', 'id'):
            # A bare "id" only names a committee on committee participants.
            if (id_key == 'id' and
                    participant.get('participant_type') != 'committee'):
                continue
            _id = participant.get(id_key)
            if _id and _id not in known_ids:
                return _id
        return None

    # Events with committee participants.
    spec = {
        settings.LEVEL_FIELD: abbr,
        'participants.committee_id': {'$nin': committee_ids}
    }
    msg = 'Removing participant %r from event %r'
    for event in db.events.find(spec):
        # Build the surviving list instead of calling remove() while
        # iterating, which would skip the element after each removal.
        kept = []
        for participant in event['participants']:
            _id = stale_id(participant)
            if _id is None:
                kept.append(participant)
            else:
                # Log the id that was actually found stale; it may live
                # under "id" rather than "committee_id".
                logger.info(msg % (_id, event['_id']))
        event['participants'][:] = kept

    # Bill actions.
    spec = {
        settings.LEVEL_FIELD: abbr,
        'actions.related_entities.type': 'committee'
    }
    msg = 'Removing entity %r from action in %r'
    for bill in db.bills.find(spec):
        for action in bill['actions']:
            kept = []
            for entity in action['related_entities']:
                if (entity['type'] == 'committee' and
                        entity['id'] not in known_ids):
                    logger.info(msg % (entity['id'], bill['bill_id']))
                else:
                    kept.append(entity)
            action['related_entities'][:] = kept
def _stale_committee_keys(participant, known_ids):
    '''Return the id keys of ``participant`` holding unknown committee ids.'''
    stale = []
    for id_key in ('committee_id', 'id'):
        # A bare "id" only names a committee on committee participants.
        if (id_key == 'id' and
                participant.get('participant_type') != 'committee'):
            continue
        _id = participant.get(id_key)
        if _id and _id not in known_ids:
            stale.append(id_key)
    return stale


def _purge_event_participants(abbr, committee_ids, known_ids, logger):
    '''Blank out unknown committee ids on event participants.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'participants.committee_id': {'$nin': committee_ids}
    }
    for event in db.events.find(spec):
        old_ids = set()
        count = 0
        for participant in event['participants']:
            for id_key in _stale_committee_keys(participant, known_ids):
                count += 1
                old_ids.add(participant[id_key])
                msg = 'Removing participant %r from event %r'
                logger.info(msg % (participant[id_key], event['_id']))
                # Leave the participant in but set their id to none.
                # Text will still be displayed without a hyperlink.
                participant[id_key] = None
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, event['_id']))
            db.events.save(event, safe=True)


def _purge_bill_action_entities(abbr, known_ids, logger):
    '''Remove unknown committees from bill actions' related entities.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'actions.related_entities.type': 'committee'
    }
    for bill in db.bills.find(spec):
        old_ids = set()
        count = 0
        for action in bill['actions']:
            # Collect the survivors rather than calling remove() during
            # iteration, which skips the element after each removal.
            kept = []
            for entity in action['related_entities']:
                if (entity['type'] == 'committee' and entity['id'] and
                        entity['id'] not in known_ids):
                    count += 1
                    old_ids.add(entity['id'])
                    msg = 'Removing entity %r from action in %r'
                    logger.debug(msg % (entity['id'], bill['bill_id']))
                    # Completely remove the related entity. Without an
                    # id it has no other purpose.
                else:
                    kept.append(entity)
            action['related_entities'][:] = kept
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, bill['_id']))
            db.bills.save(bill, safe=True)


def _purge_legislator_old_roles(abbr, known_ids, logger):
    '''Blank out unknown committee ids in legislators' old roles.'''
    spec = {settings.LEVEL_FIELD: abbr, 'old_roles': {'$exists': True}}
    for leg in db.legislators.find(spec):
        old_ids = set()
        count = 0
        for role in leg['old_roles']:
            _id = role.get('committee_id')
            if _id and _id not in known_ids:
                count += 1
                old_ids.add(_id)
                msg = 'Removing id %r from old_role in %r'
                logger.info(msg % (_id, leg['full_name']))
                # Set the id to None.
                role['committee_id'] = None
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, leg['_id']))
            db.legislators.save(leg, safe=True)


def _purge_feed_entities(abbr, known_ids, logger):
    '''Delete unknown committee ids (and their strings) from feed entries.'''
    spec = {settings.LEVEL_FIELD: abbr, 'entity_ids': {'$ne': None}}
    for entry in feeds_db.entries.find(spec):
        # Ids with "C" as third character are treated as committee ids.
        stale = [
            index for index, entity_id in enumerate(entry['entity_ids'])
            if entity_id[2:3] == 'C' and entity_id not in known_ids]
        if not stale:
            continue
        old_ids = set()
        for index in stale:
            entity_id = entry['entity_ids'][index]
            msg = 'Removing id %r from feed %r'
            logger.info(msg % (entity_id, entry['_id']))
            old_ids.add(entity_id)
        # Delete back to front so the remaining indexes stay valid and the
        # two parallel lists stay aligned.
        for index in reversed(stale):
            del entry['entity_ids'][index]
            del entry['entity_strings'][index]
        msg = 'Removed %d old committee ids %r from %r'
        logger.info(msg % (len(stale), old_ids, entry['_id']))
        feeds_db.entries.save(entry, safe=True)


def _purge_bill_sponsors(abbr, committee_ids, known_ids, logger):
    '''Drop unknown committee ids from bill sponsors.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'sponsors.committee_id': {'$nin': committee_ids}
    }
    for bill in db.bills.find(spec):
        count = 0
        old_ids = set()
        for sponsor in bill.get('sponsors', []):
            if 'committee_id' not in sponsor:
                continue
            _id = sponsor['committee_id']
            # Guard against ever deleting an id that is still valid.
            if _id in known_ids:
                continue
            old_ids.add(_id)
            count += 1
            del sponsor['committee_id']
        if count:
            msg = 'Removed %d old committee ids %r from %r'
            logger.info(msg % (count, old_ids, bill['_id']))
            # Persist the change; otherwise the deletion above is lost.
            db.bills.save(bill, safe=True)


def main():
    '''Purge references to committees that are no longer in the database.

    Abbreviations come from ``sys.argv[1:]``, falling back to every
    abbreviation in ``db.metadata``. For each one, the ids of the
    committees currently stored are loaded, and references to any other
    committee id are removed from event participants, bill actions,
    legislator old roles, feed entries and bill sponsors. Every document
    that was changed is saved back.
    '''
    import sys

    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('purge_committee_ids')
    logger.setLevel(logging.DEBUG)

    for abbr in abbrs:
        spec = {settings.LEVEL_FIELD: abbr}
        committee_ids = [
            c['_id'] for c in db.committees.find(spec, fields=['_id'])]
        # The list goes into the "$nin" queries; the set gives constant
        # time membership tests in the loops.
        known_ids = frozenset(committee_ids)

        _purge_event_participants(abbr, committee_ids, known_ids, logger)
        _purge_bill_action_entities(abbr, known_ids, logger)
        _purge_legislator_old_roles(abbr, known_ids, logger)
        _purge_feed_entities(abbr, known_ids, logger)
        _purge_bill_sponsors(abbr, committee_ids, known_ids, logger)
def _stale_committee_keys(participant, known_ids):
    '''Return the id keys of ``participant`` holding unknown committee ids.'''
    stale = []
    for id_key in ('committee_id', 'id'):
        # A bare "id" only names a committee on committee participants.
        if (id_key == 'id' and
                participant.get('participant_type') != 'committee'):
            continue
        _id = participant.get(id_key)
        if _id and _id not in known_ids:
            stale.append(id_key)
    return stale


def _purge_event_participants(abbr, committee_ids, known_ids, logger):
    '''Blank out unknown committee ids on event participants.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'participants.committee_id': {'$nin': committee_ids}
    }
    for event in db.events.find(spec):
        old_ids = set()
        count = 0
        for participant in event['participants']:
            for id_key in _stale_committee_keys(participant, known_ids):
                count += 1
                old_ids.add(participant[id_key])
                msg = 'Removing participant %r from event %r'
                logger.info(msg % (participant[id_key], event['_id']))
                # Leave the participant in but set their id to none.
                # Text will still be displayed without a hyperlink.
                participant[id_key] = None
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, event['_id']))
            db.events.save(event, safe=True)


def _purge_bill_action_entities(abbr, known_ids, logger):
    '''Remove unknown committees from bill actions' related entities.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'actions.related_entities.type': 'committee'
    }
    for bill in db.bills.find(spec):
        old_ids = set()
        count = 0
        for action in bill['actions']:
            # Collect the survivors rather than calling remove() during
            # iteration, which skips the element after each removal.
            kept = []
            for entity in action['related_entities']:
                if (entity['type'] == 'committee' and entity['id'] and
                        entity['id'] not in known_ids):
                    count += 1
                    old_ids.add(entity['id'])
                    msg = 'Removing entity %r from action in %r'
                    logger.debug(msg % (entity['id'], bill['bill_id']))
                    # Completely remove the related entity. Without an
                    # id it has no other purpose.
                else:
                    kept.append(entity)
            action['related_entities'][:] = kept
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, bill['_id']))
            db.bills.save(bill, safe=True)


def _purge_legislator_old_roles(abbr, known_ids, logger):
    '''Blank out unknown committee ids in legislators' old roles.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'old_roles': {'$exists': True}
    }
    for leg in db.legislators.find(spec):
        old_ids = set()
        count = 0
        for role in leg['old_roles']:
            _id = role.get('committee_id')
            if _id and _id not in known_ids:
                count += 1
                old_ids.add(_id)
                msg = 'Removing id %r from old_role in %r'
                logger.info(msg % (_id, leg['full_name']))
                # Set the id to None.
                role['committee_id'] = None
        if count:
            msg = 'Removed %d old committee %r ids from %r'
            logger.info(msg % (count, old_ids, leg['_id']))
            db.legislators.save(leg, safe=True)


def _purge_feed_entities(abbr, known_ids, logger):
    '''Delete unknown committee ids (and their strings) from feed entries.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'entity_ids': {'$ne': None}
    }
    for entry in feeds_db.entries.find(spec):
        # Ids with "C" as third character are treated as committee ids.
        stale = [
            index for index, entity_id in enumerate(entry['entity_ids'])
            if entity_id[2:3] == 'C' and entity_id not in known_ids]
        if not stale:
            continue
        old_ids = set()
        for index in stale:
            entity_id = entry['entity_ids'][index]
            msg = 'Removing id %r from feed %r'
            logger.info(msg % (entity_id, entry['_id']))
            old_ids.add(entity_id)
        # Delete back to front so the remaining indexes stay valid and the
        # two parallel lists stay aligned.
        for index in reversed(stale):
            del entry['entity_ids'][index]
            del entry['entity_strings'][index]
        msg = 'Removed %d old committee ids %r from %r'
        logger.info(msg % (len(stale), old_ids, entry['_id']))
        feeds_db.entries.save(entry, safe=True)


def _purge_bill_sponsors(abbr, committee_ids, known_ids, logger):
    '''Drop unknown committee ids from bill sponsors.'''
    spec = {
        settings.LEVEL_FIELD: abbr,
        'sponsors.committee_id': {'$nin': committee_ids}
    }
    for bill in db.bills.find(spec):
        count = 0
        old_ids = set()
        for sponsor in bill.get('sponsors', []):
            if 'committee_id' not in sponsor:
                continue
            _id = sponsor['committee_id']
            # Guard against ever deleting an id that is still valid.
            if _id in known_ids:
                continue
            old_ids.add(_id)
            count += 1
            del sponsor['committee_id']
        if count:
            msg = 'Removed %d old committee ids %r from %r'
            logger.info(msg % (count, old_ids, bill['_id']))
            # Persist the change; otherwise the deletion above is lost.
            db.bills.save(bill, safe=True)


def main():
    '''Remove references to committees that no longer exist.

    The abbreviations to process are read from ``sys.argv[1:]``; when none
    are given, every abbreviation in ``db.metadata`` is used. For each
    abbreviation the ids of the stored committees are fetched, then every
    reference to some other committee id is purged from event
    participants, bill actions, legislator old roles, feed entries and
    bill sponsors. Changed documents are written back to the database.
    '''
    import sys

    abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
    logger = logging.getLogger('purge_committee_ids')
    logger.setLevel(logging.DEBUG)

    for abbr in abbrs:
        spec = {settings.LEVEL_FIELD: abbr}
        committee_ids = [
            c['_id'] for c in db.committees.find(spec, fields=['_id'])]
        # The list goes into the "$nin" queries; the set gives constant
        # time membership tests in the loops.
        known_ids = frozenset(committee_ids)

        _purge_event_participants(abbr, committee_ids, known_ids, logger)
        _purge_bill_action_entities(abbr, known_ids, logger)
        _purge_legislator_old_roles(abbr, known_ids, logger)
        _purge_feed_entities(abbr, known_ids, logger)
        _purge_bill_sponsors(abbr, committee_ids, known_ids, logger)
import os
import sys
from os.path import dirname, abspath, join
import shutil

from billy.core import logging

from models import Feed
from entities import Extractor


if __name__ == '__main__':

    level = logging.DEBUG

    logging.getLogger('billy.feed-model').setLevel(level)
    logging.getLogger('billy.entry-model').setLevel(level)
    # Was 'billu.extractor', a typo that configured a logger nobody uses.
    # NOTE(review): confirm the name against the logger Extractor creates.
    logging.getLogger('billy.extractor').setLevel(level)

    # The path where the news/blogs code and urls files are located.
    PATH = dirname(abspath(__file__))

    # One urls file per jurisdiction; names containing "~" are skipped.
    filenames = [
        name for name in os.listdir(join(PATH, 'urls')) if '~' not in name]

    # Abbreviations given on the command line, if any.
    requested = sys.argv[1:]

    for urls_filename in filenames:
        abbr = urls_filename.lower().replace('.txt', '')

        # If abbrs are specified on the command line, scrape only those.
        if requested and (abbr not in requested):
            continue
class Feed(object):
    '''This model handles fetching the rss feed and recording any errors
    that occur for post-mortem reporting. It also has an instance-level
    report dictionary that gets augmented each time one of the feed's
    entries is scanned for relevant entities.
    '''

    request_defaults = dict(requests_per_minute=0, cache_write_only=False)
    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.feed-model')

    def __init__(self, url):
        '''Set up an empty report for ``url`` and make sure the feed has a
        record (and therefore an id) in mongo.
        '''
        self.url = url
        self.succeeded = None

        # Shape of the per-jurisdiction section of the report.
        self.default_report = {
            'entries': {
                'count': 0,
                'new': 0,
                'old': 0,
                'relevant': 0,
            },
            'entities': {
                'count': 0,
            }
        }

        # Per-jurisdiction info is stored under the jurisdiction's key
        # to avoid over-writing data for feeds with national scope that
        # are scanned for multiple jurisdictions.
        self.report = {'url': url}

        self._initial_save()

    def _initial_save(self):
        '''Perform the initial save (to get us the mongo_id if none
        exists yet).
        '''
        spec = dict(url=self.url)
        update = {'$set': spec}
        self.logger.info('feed._initial_save %r' % self.url)
        # new=True returns the document as it is after the upsert;
        # without it a first-time insert returns no document and the
        # '_id' lookup below fails.
        doc = feeds_db.feeds.find_and_modify(
            spec, update, upsert=True, new=True)
        self.mongo_id = doc['_id']

    def _get_feed(self):
        '''Try to fetch the feed and parse it. If the fetch fails, log
        the exception. Either way, update the report with details of the
        success/failure of the fetch.

        Returns the parsed feed, or None if the fetch failed.
        '''
        self.logger.info('feed GET %r' % self.url)
        try:
            text = self.session.get(self.url).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            # Record the failure too, not only the successes.
            self._update_report_after_fetch()
            return None

        self.succeeded = True

        # XXX: This will fail if the link doesn't point to a valid feed.
        data = feedparser.parse(text)
        self._data = data
        self._update_report_after_fetch()
        return data

    @property
    def data(self):
        '''The parsed feed contents, fetched on first access. None if
        the fetch failed.
        '''
        data = getattr(self, '_data', None)
        return data or self._get_feed()

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch fails, log the exception and store the traceback
        for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''Update the feed's report with whether the fetch operation
        succeeded or failed, including a formatted traceback if it
        failed.
        '''
        last_fetch = {
            'succeeded': self.succeeded,
            'datetime': datetime.datetime.utcnow()
        }
        if not self.succeeded:
            last_fetch['traceback'] = self.traceback
        # Store the outcome on the report that gets saved; building the
        # dict without storing it would discard the information.
        self.report.update(url=self.url, last_fetch=last_fetch)

    def entries(self):
        '''A generator of wrapped entries for this feed. Yields nothing
        if the feed could not be fetched.
        '''
        data = self.data or {}
        for entry in data.get('entries', []):
            yield Entry(entry, feed=self)

    def serializable(self):
        '''Returns metadata about the feed (url, etc) and report
        information that can be saved in mongo.
        '''
        return {'$set': self.report}

    def finish_report(self):
        '''Hook for adding extra information to the report; currently
        does nothing.
        '''

    def save(self):
        '''Write the current report to this feed's mongo record.'''
        spec = dict(url=self.url)
        feeds_db.feeds.find_and_modify(spec, self.serializable(), upsert=True)
        self.logger.info('feed.save %r' % self.url)
class Entry(object):
    '''Wrap a parsed feed entry dictionary from feedparser.
    '''
    request_defaults = dict(requests_per_minute=0, cache_write_only=False)
    session = scrapelib.Scraper(**_request_defaults(request_defaults))
    logger = logging.getLogger('billy.entry-model')

    def __init__(self, entry, feed):
        '''``entry`` is the parsed entry; ``feed`` is the feed object it
        came from.
        '''
        self.entry = entry
        self.feed = feed
        self.report = {}

        # Whether a fetch of the full text was tried and succeeded.
        self.tried = False
        self.succeeded = None

    def mongo_id(self):
        '''Get a unique mongo id based on this entry's url and title.
        '''
        s = self.entry.get('link', '') + self.entry.get('title', '')
        # md5 needs bytes; encode text so non-ascii titles don't fail.
        if not isinstance(s, bytes):
            s = s.encode('utf-8')
        return hashlib.md5(s).hexdigest()

    def is_new(self):
        '''Guess whether this entry is new (i.e., previously unseen) or
        old, by looking for a saved entry with the same mongo id.
        '''
        mongo_id = self.mongo_id()
        is_new = feeds_db.entries.find_one(mongo_id) is None
        self.logger.info('is_new? %r --> %r' % (mongo_id, is_new))
        # Without this return the caller always saw None (falsy).
        return is_new

    def _get_full_text(self):
        '''Just for experimenting at this point. Fetch the full text,
        log any exception that occurs, and store the details regarding
        the outcome of the fetch on the object.

        Returns the fetched text, or None if the fetch failed.
        '''
        # NOTE(review): the entry's 'link' is assumed to be the url to
        # fetch, as it is the url used by mongo_id -- confirm.
        url = self.entry['link']
        self.logger.info('entry GET %r' % url)
        try:
            html = self.session.get(url).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            return None

        self.succeeded = True
        self.tried = True
        self.html = html
        self._update_report_after_fetch()
        return html

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch failed, log the failure and store the traceback
        for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''After fetching the entry's full text (if at all), update the
        entry's report with the outcome of the fetch operation, including
        a traceback if it failed.
        '''
        # The url and the entity ids live on the wrapped entry; this
        # class has no "url" attribute and does not support indexing.
        report = {
            'url': self.entry.get('link'),
            'entity_count': len(self.entry.get('entity_ids') or []),
        }
        if self.tried:
            last_fetch = {
                'succeeded': self.succeeded,
                'datetime': datetime.datetime.utcnow()
            }
            if not self.succeeded:
                last_fetch['traceback'] = self.traceback
            report.update(last_fetch=last_fetch)
        self.report.update(report)

    def serializable(self):
        '''Return a copy of the entry that can be saved in mongo, with
        time structs replaced by datetime objects.
        '''
        # Keep every field of the entry, not only the dates.
        ret = dict(self.entry)

        # Save under the id that is_new() looks entries up by.
        ret['_id'] = self.mongo_id()

        # Add the feed's id to make the entry and its feed joinable.
        ret['feed_id'] = self.feed.mongo_id

        # Convert unserializable timestructs into datetimes. The fields
        # are used as they are (feedparser documents its parsed dates as
        # UTC), so no local-time conversion such as mktime is applied.
        for k, v in self.entry.items():
            if isinstance(v, time.struct_time):
                ret[k] = datetime.datetime(*v[:6])
        return ret

    def save_if_entities_found(self):
        '''If the entry is previously unseen and the extractor finds
        entities have been mentioned, save, otherwise do nothing.
        '''
        if self.is_new() and self.entry.get('entity_ids'):
            # A Logger is not callable; use its info() method.
            self.logger.info(
                'entry.save_if_entities_found: %r' % self.entry.get('link'))
            # NOTE(review): the save call is assumed from the method's
            # name and docstring -- confirm collection and options.
            feeds_db.entries.save(self.serializable())

    def finish_report(self, abbr):
        '''After attempting to extract entities, update the report of
        this entry's feed with relevant information.

        The feed's report for a jurisdiction has this basic shape:
            {
              'entries': {
                  'count': 0,
                  'new': 0,
                  'old': 0,
                  'relevant': 0,
                  },
              'entities': {
                  'count' : 0,
                  }
            }

        `abbr` is the jurisdiction abbreviation this info will be stored
        under in the feed's report object.
        '''
        # Update the feed's report. setdefault stores a fresh section
        # under `abbr` when there is none, so the counts end up in the
        # report instead of in the feed's shared default.
        feed_report = self.feed.report
        report = feed_report.setdefault(abbr, {
            'entries': {'count': 0, 'new': 0, 'old': 0, 'relevant': 0},
            'entities': {'count': 0},
        })

        report['entries']['count'] += 1

        # If this is a new entry...
        if self.tried:
            report['entries']['new'] += 1
            entity_ids = self.entry.get('entity_ids')
            if entity_ids:
                report['entries']['relevant'] += 1
                report['entities']['count'] += len(entity_ids)
        else:
            report['entries']['old'] += 1
class Feed(object):
    '''This model handles fetching the rss feed and recording any errors
    that occur for post-mortem reporting. It also has an instance-level
    report dictionary that gets augmented each time one of the feed's
    entries is scanned for relevant entities.
    '''

    request_defaults = dict(
        cache_obj=FileCache(FEEDS_CACHE),
        requests_per_minute=0,
        cache_write_only=False)

    session = scrapelib.Scraper(
        **_request_defaults(request_defaults))
    logger = logging.getLogger('billy.feed-model')

    def __init__(self, url, jurisdiction):
        '''Set up the report for ``url`` as scanned for ``jurisdiction``
        and make sure the feed has a record (and an id) in mongo.
        '''
        self.url = url
        self.jurisdiction = jurisdiction

        # None until a fetch has been attempted, then True or False.
        self.succeeded = None
        self.default_report = {
            'entries': {
                'count': 0,
                'new': 0,
                'old': 0,
                'relevant': 0,
            },
            'entities': {
                'count': 0,
            }
        }
        self.report = {
            'url': url,

            # The info is stored under the jurisdiction key
            # to avoid over-writing data for feeds with national scope that
            # are scanned for multiple jurisdictions.
            jurisdiction: self.default_report
        }

        # Make sure this feed has a mongo id.
        self._initial_save()

    @staticmethod
    def blast_cache(self=None):
        '''Remove the scrapelib.Scraper fastmode cache for feed retrieval.
        Done before a scrape, but not before multiple jurisdictions in a
        single run, in case a feed of national scope needs to get
        processed for each state.

        The argument is unused. It now has a default so that the static
        method can be called as ``Feed.blast_cache()``; calls that pass a
        value keep working.
        '''
        shutil.rmtree(FEEDS_CACHE)

    def _initial_save(self):
        '''Perform the initial save (to get us the mongo_id if none
        exists yet).
        '''
        spec = dict(url=self.url)
        update = {'$set': spec}
        self.logger.debug('feed._initial_save %r' % self.url)
        doc = feeds_db.feeds.find_and_modify(
            spec, update, upsert=True, new=True)
        self.mongo_id = doc['_id']

    def _get_feed(self):
        '''Try to fetch the feed and parse it. If the fetch fails, log
        the exception. Finally, update the report with details of the
        success/failure of the fetch.

        Returns the parsed feed, or None if the fetch failed.
        '''
        try:
            text = self.session.get(self.url).text
        except Exception:
            tb = traceback.format_exc()
            self._handle_fetch_exception(tb)
            self._update_report_after_fetch()
            return None

        self.succeeded = True

        # XXX: This will fail if the text isn't a valid feed.
        data = feedparser.parse(text)
        self._data = data
        self._update_report_after_fetch()
        return data

    @property
    def data(self):
        '''The parsed feed contents, or an empty dict if the feed could
        not be fetched.
        '''
        data = getattr(self, '_data', None)
        if data:
            return data
        # Only fetch if no attempt has been made yet; a failed fetch is
        # not repeated (and logged again) on every access.
        if self.succeeded is None:
            return self._get_feed() or {}
        return {}

    def is_valid(self):
        '''Whether the parsed data has a "feed" section with a title.
        '''
        return 'title' in self.data.get('feed', {})

    def _handle_fetch_exception(self, _traceback):
        '''If the fetch fails, log the exception and store the traceback
        for the report.
        '''
        self.traceback = _traceback
        self.logger.exception(_traceback)
        self.succeeded = False

    def _update_report_after_fetch(self):
        '''Update the feed's report with whether the fetch operation
        succeeded or failed, including a formatted traceback if it
        failed.
        '''
        last_fetch = {
            'succeeded': self.succeeded,
            'datetime': datetime.datetime.utcnow()
        }
        if not self.succeeded:
            last_fetch['traceback'] = self.traceback
        self.report[self.jurisdiction].update(last_fetch=last_fetch)

    def entries(self):
        '''A generator of wrapped entries for this feed.
        '''
        data = self.data or {}
        entries = data.get('entries', [])
        for entry in entries:
            yield Entry(entry, feed=self)

    def serializable(self):
        '''Returns metadata about the feed (url, etc) and report
        information that can be saved in mongo.
        '''
        return {'$set': self.report}

    def finish_report(self):
        '''Extra stuff to go in the report goes here.
        '''

    def save(self):
        '''Update the feed record with the latest report. Feeds that are
        not valid are not saved.
        '''
        if not self.is_valid():
            return
        spec = dict(url=self.url)
        # Same update document as before, built in one place.
        update = self.serializable()
        self.logger.debug('feed.finish_report %r' % self.url)
        feeds_db.feeds.find_and_modify(spec, update, upsert=True, new=True)
        self.logger.info('feed.save %r' % self.url)