def scrape_bills(self): """ Does the following 1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module 2) Iterates over bill data and converts each one to an OCD-compliant bill model. 3) Yields the OCD-compliant bill model instance @return: yield Bill instance """ # run scraper first to pull in all the bill data self.run_unitedstates_bill_scraper() # iterate over all the files and build and yield Bill objects for filename in find_files(settings.SCRAPED_DATA_DIR, '.*[a-z]*\/[a-z]*[0-9]*\/data\.json'): try: with open(filename) as json_file: json_data = json.load(json_file) # Initialize Object bill = Bill(self.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'], json_data['congress'], json_data['official_title'], chamber=self.TYPE_MAP[json_data['bill_type']]['chamber'] ) # Basics bill.type = [json_data['bill_type']] bill.subject = json_data['subjects'] bill.add_summary(json_data['summary']['as'], json_data['summary']['text'], json_data['summary']['date']) # Common Fields bill.sources = [{'url': json_data['url'], 'note': 'all'}] # Other/Related Bills bill.other_titles = [{'note': t['type'], 'title': t['title']} for t in json_data['titles']] # change value of relationship_type to 'type' field from json_data when permitted by schema bill.related_bills = [{'session': b['session'], 'name': b['name'], 'relationship_type':'companion'} for b in json_data['related_bills']] # add primary sponsor bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True, scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'], chamber=self.TYPE_MAP[json_data['bill_type']]['chamber']) # add cosponsors for cs in json_data['cosponsors']: bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False, scheme='thomas_id', identifier=cs['thomas_id'], chamber=self.TYPE_MAP[json_data['bill_type']]['chamber']) # add introduced_at and actions bill.actions.append({'date': json_data['introduced_at'], 'type': 'introduced', 'description': 'date of introduction', 'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'], 'related_entities': []}) for action in json_data['actions']: bill.actions.append({'date': action['acted_at'], 'type': [action['type']], 'description': action['text'], 'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'], 'related_entities': [] }) # add bill versions for version_path in find_files(os.path.join(settings.SCRAPED_DATA_DIR, 'data', bill.session, 'bills', json_data['bill_type'], json_data['bill_type'] + json_data['number'], 'text-versions'), '*\.json'): try: with open(version_path) as version_file: version_json_data = json.load(version_file) for k, v in version_json_data['urls'].iteritems(): bill.versions.append({'date': version_json_data['issued_on'], 'type': version_json_data['version_code'], 'name': self.VERSION_MAP[version_json_data['version_code']], 'links': [{'mimetype': k, 'url': v}]}) except IOError: print("Unable to open or parse file with path " + version_path) continue yield bill except IOError: print("Unable to open or parse file with path " + filename) continue
def scrape_bills(self): """ Does the following 1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module 2) Iterates over bill data and converts each one to an OCD-compliant bill model. 3) Yields the OCD-compliant bill model instance @return: yield Bill instance """ # run scraper first to pull in all the bill data self.run_unitedstates_bill_scraper() # iterate over all the files and build and yield Bill objects for filename in find_files(settings.SCRAPED_DATA_DIR, '.*[a-z]*\/[a-z]*[0-9]*\/data\.json'): try: with open(filename) as json_file: json_data = json.load(json_file) # Initialize Object bill = Bill( self.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'], json_data['congress'], json_data['official_title'], chamber=self.TYPE_MAP[ json_data['bill_type']]['chamber']) # Basics bill.type = [json_data['bill_type']] bill.subject = json_data['subjects'] bill.add_summary(json_data['summary']['as'], json_data['summary']['text'], json_data['summary']['date']) # Common Fields bill.sources = [{'url': json_data['url'], 'note': 'all'}] # Other/Related Bills bill.other_titles = [{ 'note': t['type'], 'title': t['title'] } for t in json_data['titles']] # change value of relationship_type to 'type' field from json_data when permitted by schema bill.related_bills = [{ 'session': b['session'], 'name': b['name'], 'relationship_type': 'companion' } for b in json_data['related_bills']] # add primary sponsor bill.add_sponsorship_by_identifier( json_data['sponsor']['name'], 'person', 'person', True, scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'], chamber=self.TYPE_MAP[ json_data['bill_type']]['chamber']) # add cosponsors for cs in json_data['cosponsors']: bill.add_sponsorship_by_identifier( cs['name'], 'person', 'person', False, scheme='thomas_id', identifier=cs['thomas_id'], chamber=self.TYPE_MAP[ json_data['bill_type']]['chamber']) # add introduced_at and actions bill.actions.append({ 'date': json_data['introduced_at'], 'type': 'introduced', 'description': 'date of introduction', 'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'], 'related_entities': [] }) for action in json_data['actions']: bill.actions.append({ 'date': action['acted_at'], 'type': [action['type']], 'description': action['text'], 'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'], 'related_entities': [] }) # add bill versions for version_path in find_files( os.path.join( settings.SCRAPED_DATA_DIR, 'data', bill.session, 'bills', json_data['bill_type'], json_data['bill_type'] + json_data['number'], 'text-versions'), '*\.json'): try: with open(version_path) as version_file: version_json_data = json.load(version_file) for k, v in version_json_data[ 'urls'].iteritems(): bill.versions.append({ 'date': version_json_data['issued_on'], 'type': version_json_data['version_code'], 'name': self.VERSION_MAP[ version_json_data['version_code']], 'links': [{ 'mimetype': k, 'url': v }] }) except IOError: print("Unable to open or parse file with path " + version_path) continue yield bill except IOError: print("Unable to open or parse file with path " + filename) continue