def recognize_dates(): config = Config(hostname="phila.legistar.com", sponsor_links=False, date_format="%m/%d/%Y").defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) summaries = scraper.searchLegislation("") summary = summaries.next() import datetime assert_is_instance(summary["File Created"], datetime.datetime)
# Variant of recognize_dates that passes a plain dict instead of a Config
# object.
def recognize_dates():
    config = {"hostname": "phila.legistar.com",
              "date_format": "%m/%d/%Y",
              "fulltext": True}
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    summary = summaries.next()
    assert_is_instance(summary["File Created"], datetime.datetime)
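# Both recognize_dates variants above exercise the same behavior: with a
# date_format configured, the scraper is expected to parse grid cells into
# datetime objects. A minimal sketch of that conversion, assuming the
# '%m/%d/%Y' format used throughout these snippets (the sample value is
# hypothetical):
def _parse_legistar_date(cell_text, date_format="%m/%d/%Y"):
    import datetime
    return datetime.datetime.strptime(cell_text, date_format)

# _parse_legistar_date("10/05/2012") -> datetime.datetime(2012, 10, 5, 0, 0)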
def supports_simple_initial_search_form():
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    try:
        summaries.next()
    except StopIteration:
        fail("no legislation found")
def supports_simple_initial_search_form():
    config = Config(hostname="phila.legistar.com",
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    try:
        summaries.next()
    except StopIteration:
        fail("no legislation found")
def paging_through_legislation(): config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) summaries = list(scraper.searchLegislation("pub")) # Making summaries a list forces the scraper to iterate completely through # the generator for s in summaries: print s["Record #"] assert_greater(len(summaries), 100)
def supports_advanced_initial_search_form(): config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) summaries = scraper.searchLegislation("") try: summaries.next() except StopIteration: # fail('no legislation found') assert False
def paging_through_results():
    config = {"hostname": "chicago.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    # Making summaries a list forces the scraper to iterate completely
    # through the generator.
    summaries = list(scraper.searchLegislation("pub"))
    for s in summaries:
        print s["Record #"]
    assert_greater(len(summaries), 100)
# Assumed imports for the wrapper classes below (Python 2; the import path
# for LegistarScraper is an assumption).
import datetime
import logging
import time
import urllib2
import urlparse

from legistar.scraper import LegistarScraper

log = logging.getLogger(__name__)


class HostedLegistarSiteWrapper(object):
    """
    A generic facade over hosted legistar site data scraper. It is
    responsible for interpreting data scraped out of the site by
    LegistarScraper. The main external point of interaction is
    scrape_legis_file.

    NOTE that this is a superclass that will not run by itself and isn't
    meant to be; you are expected to run a subclass that implements some
    functions with names starting with "pluck".

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, cmdline_options, **options):
        self.scraper = LegistarScraper(options)
        if cmdline_options['year']:
            self.legislation_summaries = self.scraper.searchLegislation(
                '', year=cmdline_options['year'])
        else:
            self.legislation_summaries = self.scraper.searchLegislation('')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                print e
                print 'skipping to next leg record'
            except AttributeError as e:
                print e
                print 'skipping to next leg record'
            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    print e
                    print 'sleeping for five minutes'
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        record = self.pluck_record(key, summary, legislation_attrs)
        attachments = self.pluck_attachments(key, legislation_attrs)

        actions = []
        for act in legislation_history:
            act_details, act_votes = [], []
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                log.debug('LegAction has no url')
            else:
                if act_votes:
                    print "act_votes", act_votes
            try:
                action = self.pluck_action(key, act, act_details, act_votes)
            except TypeError as e:
                print e
                print summary
                continue
            except KeyError as e:
                print act
                print e
                print summary
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''

    def check_for_new_content(self, last_key):
        '''Grab the next legislation summary row. Doesn't use the last_key
        parameter; just starts at the beginning for each instance of the
        scraper.
        '''
        try:
            print 'next leg record'
            next_summary = self.legislation_summaries.next()
            return 0, next_summary
        except StopIteration:
            return None, None

    def init_pdf_cache(self, pdf_mapping):
        pass
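# The superclass above delegates record extraction to pluck_* hooks, called
# from scrape_legis_file with the signatures pluck_record(key, summary,
# attrs), pluck_attachments(key, attrs), and pluck_action(key, act,
# act_details, act_votes). A minimal sketch of a subclass, assuming summary
# rows carry the 'Record #' and 'Title' columns seen elsewhere in this file:
class MinimalSiteWrapper(HostedLegistarSiteWrapper):
    def pluck_record(self, key, summary, legislation_attrs):
        return {'key': key,
                'id': summary['Record #'],
                'title': summary['Title']}

    def pluck_attachments(self, key, legislation_attrs):
        # No attachment handling in this sketch.
        return []

    def pluck_action(self, key, act, act_details, act_votes):
        return {'key': key, 'description': act['Action']}

# Usage (cmdline_options must carry a 'year' entry, per __init__ above):
# wrapper = MinimalSiteWrapper({'year': None}, hostname='phila.legistar.com')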
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data. It is
    responsible for scraping data out of the site. The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                print e
                print 'skipping to next leg record'
            except AttributeError as e:
                print e
                print 'skipping to next leg record'
            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    print e
                    print 'sleeping for five minutes'
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name as '[First] [Last]' instead of
        # '[Last], [First]'
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        record = {
            'key': key,
            'id': summary['Record #'],
            'url': summary['URL'],
            'type': summary['Type'],
            'status': summary['Status'],
            'title': summary['Title'],
            'controlling_body': legislation_attrs['Current Controlling Legislative Body'],
            'intro_date': self.convert_date(summary['Intro Date']),
            'final_date': self.convert_date(summary.setdefault('Final Date', '')),
            'version': summary.setdefault('Version', ''),
            #'contact': None,
            'sponsors': first_name_first_sponsors,
            # probably remove this from the model as well
            'minutes_url': None
        }

        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print e
                print summary
                continue
            try:
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': act['Action By']['label'],
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': ''
                }
            except TypeError as e:
                print e
                print summary
                continue
            except KeyError as e:
                print act
                print e
                print summary
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''

    def check_for_new_content(self, last_key):
        '''Grab the next legislation summary row. Doesn't use the last_key
        parameter; just starts at the beginning for each instance of the
        scraper.
        '''
        try:
            print 'next leg record'
            next_summary = self.legislation_summaries.next()
            return 0, next_summary
        except StopIteration:
            return None, None

    def init_pdf_cache(self, pdf_mapping):
        pass
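# A sketch of how a caller might drive the wrapper above: poll
# check_for_new_content for the next summary row, then hand it to
# scrape_legis_file. The hostname and fulltext values are taken from the
# snippets in this file; the loop itself is an assumption about the harness.
def scrape_everything():
    wrapper = HostedLegistarSiteWrapper(hostname='phila.legistar.com',
                                        fulltext=True)
    last_key = None
    while True:
        last_key, summary = wrapper.check_for_new_content(last_key)
        if summary is None:
            break
        record, attachments, actions, minutes = \
            wrapper.scrape_legis_file(last_key, summary)
        print record['id']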
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data. It is
    responsible for scraping data out of the site. The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.topics_label = options.pop('topics_label', 'Topic')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop(
            'controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            except AttributeError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    log.warning(e)
                    log.warning('sleeping for five minutes')
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name as '[First] [Last]' instead of
        # '[Last], [First]'
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        topics = legislation_attrs.get('Topics', None)
        if topics is None:
            joined_topics = legislation_attrs.get(self.topics_label, '')
            topics = [topic.strip() for topic in joined_topics.split(',')]

        try:
            record = {
                'key': key,
                'id': summary[self.id_label],
                'url': summary[self.url_label],
                'type': summary[self.type_label],
                'status': summary[self.status_label],
                'title': summary[self.title_label],
                'topics': topics,
                'controlling_body': legislation_attrs[self.controlling_body_label],
                'intro_date': self.convert_date(summary[self.intro_date_label]),
                'final_date': self.convert_date(summary.setdefault(self.final_date_label, '')),
                'version': summary.setdefault(self.version_label, ''),
                #'contact': None,
                'sponsors': first_name_first_sponsors,
                # probably remove this from the model as well
                'minutes_url': None
            }
        except KeyError as e:
            raise ScrapeError('Failed to find key %s in either summary keys '
                              '(%r) or attrs (%r)' % (e, summary.keys(),
                                                      legislation_attrs.keys()))

        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print e
                print summary
                continue
            try:
                acting_body = act['Action By']
                if not isinstance(acting_body, basestring):
                    acting_body = acting_body['label']
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': acting_body,
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': '',
                    'votes': [{'voter': vote['Person Name'],
                               'value': vote['Vote']}
                              for vote in act_votes]
                }
            except TypeError as e:
                print e
                print summary
                continue
            except KeyError as e:
                print act
                print e
                print summary
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        # Needed by scrape_legis_file; same implementation as the other
        # variants in this file.
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''
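# ScrapeError is raised above but not defined in these snippets; a minimal
# stand-in, assuming it is a plain Exception subclass in the project:
class ScrapeError(Exception):
    pass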
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data. It is
    responsible for scraping data out of the site. The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.indexes_label = options.pop('indexes_label', 'Indexes')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop(
            'controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            except AttributeError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    log.warning(e)
                    log.warning('sleeping for five minutes')
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name as '[First] [Last]' instead of
        # '[Last], [First]'
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        try:
            record = {
                'key': key,
                'id': summary[self.id_label],
                'url': summary[self.url_label],
                'type': summary[self.type_label],
                'status': summary[self.status_label],
                'title': summary[self.title_label],
                'indexes': legislation_attrs[self.indexes_label],
                'controlling_body': legislation_attrs[self.controlling_body_label],
                'intro_date': self.convert_date(summary[self.intro_date_label]),
                'final_date': self.convert_date(summary.setdefault(self.final_date_label, '')),
                'version': summary.setdefault(self.version_label, ''),
                #'contact': None,
                'sponsors': first_name_first_sponsors,
                # probably remove this from the model as well
                'minutes_url': None
            }
        except KeyError as e:
            raise ScrapeError('Failed to find key %s in either summary keys '
                              '(%r) or attrs (%r)' % (e, summary.keys(),
                                                      legislation_attrs.keys()))

        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print e
                print summary
                continue
            try:
                acting_body = act['Action By']
                if not isinstance(acting_body, basestring):
                    acting_body = acting_body['label']
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': acting_body,
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': '',
                    'votes': [{'voter': vote['Person Name'],
                               'value': vote['Vote']}
                              for vote in act_votes]
                }
            except TypeError as e:
                print e
                print summary
                continue
            except KeyError as e:
                print act
                print e
                print summary
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        # Needed by scrape_legis_file; same implementation as the other
        # variants in this file.
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''
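# The *_label options above exist so the same wrapper can run against
# Legistar deployments whose grids use different column names. A sketch of
# overriding them; the alternate labels shown ('File #', 'File Name') are
# hypothetical examples, not taken from any particular site:
wrapper = HostedLegistarSiteWrapper(
    hostname='chicago.legistar.com',
    id_label='File #',
    title_label='File Name',
)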
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data. It is
    responsible for scraping data out of the site. The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                print e
                print 'skipping to next leg record'
            except AttributeError as e:
                print e
                print 'skipping to next leg record'
            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    print e
                    print 'sleeping for five minutes'
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name as '[First] [Last]' instead of
        # '[Last], [First]'; rsplit keeps any comma-embedded suffix with the
        # surname
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.rsplit(',', 1)
                name_list.reverse()
                sponsor = ' '.join(name_list).strip().replace(',', '')
            first_name_first_sponsors.append(sponsor)

        record = {
            'key': key,
            'id': summary['Record #'],
            'url': summary['URL'],
            'type': summary['Type'],
            'status': summary['Status'],
            'title': summary['Title'],
            'controlling_body': legislation_attrs['Current Controlling Legislative Body'],
            'intro_date': self.convert_date(summary['Intro Date']),
            'final_date': self.convert_date(summary.setdefault('Final Date', '')),
            'version': summary.setdefault('Version', ''),
            #'contact': None,
            'sponsors': first_name_first_sponsors,
            # probably remove this from the model as well
            'minutes_url': None
        }

        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print e
                print summary
                continue
            try:
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': act['Action By']['label'],
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': ''
                }
            except TypeError as e:
                print e
                print summary
                continue
            except KeyError as e:
                print act
                print e
                print summary
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''

    def check_for_new_content(self, last_key):
        '''Grab the next legislation summary row. Doesn't use the last_key
        parameter; just starts at the beginning for each instance of the
        scraper.
        '''
        try:
            print 'next leg record'
            next_summary = self.legislation_summaries.next()
            return 0, next_summary
        except StopIteration:
            return None, None

    def init_pdf_cache(self, pdf_mapping):
        pass
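# The class above switches the sponsor reordering from split(',') to
# rsplit(',', 1), so names with an embedded comma keep their suffix attached
# to the surname. A small demonstration of that logic in isolation (the
# sample names are hypothetical):
def reorder_sponsor(sponsor):
    if ',' in sponsor:
        name_list = sponsor.rsplit(',', 1)
        name_list.reverse()
        sponsor = ' '.join(name_list).strip().replace(',', '')
    return sponsor

# reorder_sponsor('Jones, Bill')      -> 'Bill Jones'
# reorder_sponsor('Jones, Jr., Bill') -> 'Bill Jones Jr.'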