def __init__(self, name_start=''):
    """
    Initialize the scraper.

    name_start, if provided, is the name of the first restaurant
    (alphabetically) at which scraping should begin; it is stored
    lowercased. Handy for resuming a multi-hour run that broke
    partway through -- pick up roughly where it left off.
    """
    NewsItemListDetailScraper.__init__(self)
    self.name_start = name_start.lower()
def __init__(self, name_start=""):
    """
    Set up the scraper, optionally resuming at a given restaurant.

    When name_start is given it should be the first restaurant name,
    alphabetically, to scrape (stored lowercased). Useful after the
    scraper has broken several hours in: restart near where it died.
    """
    NewsItemListDetailScraper.__init__(self)
    self.name_start = name_start.lower()
def __init__(self, mdb_filename=None):
    """
    Set up the scraper.

    mdb_filename, when given, names an MDB file on the local
    filesystem to import; otherwise the latest available file will be
    located online. All table caches start out unset.
    """
    NewsItemListDetailScraper.__init__(self)
    self._local_mdb_filename = mdb_filename
    self._mdb_filename = None
    # Lazily-populated caches for the MDB tables.
    self._locations_cache = None
    self._inspection_type_cache = None
    self._violations_cache = None
    self._violation_type_cache = None
def parse_list(self, page):
    """
    Parse the list page, sanity-checking the record count.

    Raises ScraperBroken on 99+ records, which suggests the source
    truncated the result set and the date interval should be narrowed.
    """
    records = list(NewsItemListDetailScraper.parse_list(self, page))
    num_records = len(records)
    self.logger.debug('Got %s records', num_records)
    if num_records >= 99:
        raise ScraperBroken(
            'Got %s records. Consider changing date interval' % num_records)
    return records
def get_html(self, *args, **kwargs):
    """
    Fetch HTML, retrying on transient PostgreSQL connection errors.

    Retries up to MAX_TRIES times, sleeping 3 seconds between
    attempts. Raises ScraperBroken if the error persists.
    """
    MAX_TRIES = 4
    tries = 0
    while tries <= MAX_TRIES:
        html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
        if 'Unable to connect to PostgreSQL server' in html:
            self.logger.debug('Got "Unable to connect to PostgreSQL" error')
            # BUG FIX: tries was never incremented, so a persistent
            # error caused an infinite retry loop instead of the
            # intended bounded retry.
            tries += 1
            time.sleep(3)
            continue
        return html
    raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
def get_html(self, *args, **kwargs):
    """
    Fetch HTML via the parent class, retrying on transient
    'Unable to connect to PostgreSQL server' responses.

    Sleeps 3 seconds between attempts and raises ScraperBroken after
    the retry budget (MAX_TRIES) is exhausted.
    """
    MAX_TRIES = 4
    tries = 0
    while tries <= MAX_TRIES:
        html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
        if 'Unable to connect to PostgreSQL server' in html:
            self.logger.debug('Got "Unable to connect to PostgreSQL" error')
            # BUG FIX: the counter was never advanced, so the loop
            # could spin forever on a persistent database outage.
            tries += 1
            time.sleep(3)
            continue
        return html
    raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
def parse_list(self, record_html):
    """
    Parse a detail page, folding list-record fields into each result.

    record_html is a (list_record, html) pair as produced by
    list_pages(), rather than the raw HTML this method normally gets.
    A better version of the restaurant address is available on this
    page, so attempt to extract the ZIP code to resolve geocoding
    ambiguities; failure to find it is logged and non-fatal.
    """
    list_record, html = record_html
    try:
        # search() returns None on no match (-> AttributeError), and
        # groupdict() access raises KeyError if the pattern has no
        # 'zipcode' group.
        info = self.detail_address_re.search(html).groupdict()
        list_record['zipcode'] = info['zipcode']
    except (AttributeError, KeyError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and masked unrelated bugs.
        self.logger.info("Could not get detailed address information for record %s: %s" % (list_record['restaurant_id'], list_record['restaurant_name']))
    for record in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **record)
def parse_list(self, record_html):
    """
    Parse a detail page, merging in fields from the list record.

    Normally this method gets passed raw HTML, but list_pages() hands
    us a (list_record, html) pair. The detail page carries a more
    precise restaurant address, so we try to pull the ZIP code from it
    to resolve location ambiguities; a miss is logged and ignored.
    """
    list_record, html = record_html
    try:
        # search() yields None when the address pattern misses
        # (-> AttributeError); a missing 'zipcode' key -> KeyError.
        info = self.detail_address_re.search(html).groupdict()
        list_record['zipcode'] = info['zipcode']
    except (AttributeError, KeyError):
        # BUG FIX: replaced a bare `except:` that swallowed every
        # exception type, including KeyboardInterrupt.
        self.logger.info(
            "Could not get detailed address information for record %s: %s" %
            (list_record['restaurant_id'], list_record['restaurant_name']))
    for record in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **record)
def __init__(self, filename=None):
    """Remember the optional local data file, then set up the scraper."""
    self.woodstock_filename = filename
    NewsItemListDetailScraper.__init__(self)
def parse_list(self, record_html):
    """
    Yield parent-parsed records with the list record's fields merged in.

    record_html is a (list_record, html) pair; fields parsed from the
    HTML take precedence over those already in list_record.
    """
    list_record, html = record_html
    for parsed in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **parsed)
def __init__(self, filename=None):
    """Store the optional Woodstock filename and initialize the parent."""
    self.woodstock_filename = filename
    NewsItemListDetailScraper.__init__(self)
def parse_list(self, page):
    """
    Delegate list parsing to the parent, guarding against truncation.

    A count of 99 or more records indicates the source capped the
    result set; raise ScraperBroken so the date interval gets changed.
    """
    records = list(NewsItemListDetailScraper.parse_list(self, page))
    count = len(records)
    self.logger.debug('Got %s records', count)
    if count >= 99:
        message = 'Got %s records. Consider changing date interval' % count
        raise ScraperBroken(message)
    return records
def __init__(self, start_date=None, end_date=None):
    """
    Set up the scraper's date window.

    Defaults: start_date falls back to 31 days ago, end_date to today.
    """
    NewsItemListDetailScraper.__init__(self)
    today = datetime.date.today()
    if not start_date:
        start_date = today - datetime.timedelta(days=31)
    if not end_date:
        end_date = today
    self.start_date = start_date
    self.end_date = end_date
def __init__(self, start_date=None, end_date=None):
    """
    Set up the scraper with caching disabled.

    When start_date is omitted, both endpoints default to the five-day
    window ending today (any passed end_date is ignored in that case).
    """
    NewsItemListDetailScraper.__init__(self, use_cache=False)
    if start_date is None:
        today = datetime.date.today()
        end_date = today
        start_date = today - datetime.timedelta(days=5)
    self.start_date = start_date
    self.end_date = end_date
def parse_detail(self, page, list_record):
    """
    Normalize whitespace on the detail page, then parse it.

    The pages use a ton of &nbsp;s for some reason, so convert them to
    plain spaces to keep the parse_detail_re regex readable.
    """
    # BUG FIX: this previously read page.replace(' ', ' ') -- a no-op,
    # apparently from the '&nbsp;' literal being mangled at some point.
    page = page.replace('&nbsp;', ' ')
    return NewsItemListDetailScraper.parse_detail(self, page, list_record)
def __init__(self, start_date=None, end_date=None):
    """
    Initialize a non-caching scraper over the given date range.

    Absent a start_date, scrape the last five days: end_date is forced
    to today and start_date to five days before it.
    """
    NewsItemListDetailScraper.__init__(self, use_cache=False)
    if start_date is None:
        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=5)
        end_date = today
    self.start_date = start_date
    self.end_date = end_date
def parse_list(self, page):
    """
    page is a (facility_type, html) pair; tag every record parsed from
    the HTML with its facility type.
    """
    facility_type, html = page
    for rec in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(rec, facility_type=facility_type)
def parse_list(self, record_html):
    """
    Unpack the (list_record, html) pair and yield each record the
    parent parses from the HTML, combined with list_record's fields
    (parsed fields win on key collisions).
    """
    base_record, html = record_html
    for extra in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(base_record, **extra)
def __init__(self, start_date=None, end_date=None):
    """
    Initialize the scraper, defaulting the window to the past month:
    start_date -> 31 days ago, end_date -> today.
    """
    NewsItemListDetailScraper.__init__(self)
    today = datetime.date.today()
    month_ago = today - datetime.timedelta(days=31)
    self.start_date = start_date if start_date else month_ago
    self.end_date = end_date if end_date else today
def parse_list(self, page):
    """
    Parse records out of the HTML half of the (facility_type, html)
    pair, annotating each with the facility type.
    """
    facility_type, html = page
    parent_records = NewsItemListDetailScraper.parse_list(self, html)
    for record in parent_records:
        tagged = dict(record)
        tagged['facility_type'] = facility_type
        yield tagged
def __init__(self, hours=8, *args, **kwargs):
    """Record how many trailing hours to scrape (default 8), then
    initialize the parent with any remaining arguments."""
    self.num_hours = hours
    NewsItemListDetailScraper.__init__(self, *args, **kwargs)