Ejemplo n.º 1
0
 def __init__(self, name_start=''):
     """Initialize the scraper, optionally resuming mid-alphabet.

     name_start, if given, should be a string of the first restaurant
     name to start scraping, alphabetically. This is useful if you've
     run the scraper and it's broken several hours into it -- you can
     pick up around where it left off.
     """
     NewsItemListDetailScraper.__init__(self)
     self.name_start = name_start.lower()
Ejemplo n.º 2
0
 def __init__(self, name_start=""):
     """Set up the scraper, resuming alphabetically if asked.

     name_start, if given, should be a string of the first restaurant
     name to start scraping, alphabetically. Useful when a previous run
     broke several hours in -- you can pick up around where it left off.
     """
     NewsItemListDetailScraper.__init__(self)
     self.name_start = name_start.lower()
Ejemplo n.º 3
0
 def __init__(self, mdb_filename=None):
     """Set up the scraper, optionally pointed at a local MDB file.

     If mdb_filename is given, it should be the name of an MDB file on
     the local filesystem to import. Otherwise, this will try to find
     the latest one available online.
     """
     NewsItemListDetailScraper.__init__(self)
     self._local_mdb_filename = mdb_filename
     self._mdb_filename = None
     # Lazily-populated lookup caches; None means "not loaded yet".
     self._locations_cache = None
     self._inspection_type_cache = None
     self._violations_cache = None
     self._violation_type_cache = None
Ejemplo n.º 4
0
 def parse_list(self, page):
     """Parse via the superclass, but treat a full result page (99+
     records) as a sign the date interval needs to be narrowed."""
     records = list(NewsItemListDetailScraper.parse_list(self, page))
     count = len(records)
     self.logger.debug('Got %s records', count)
     if count >= 99:
         raise ScraperBroken(
             'Got %s records. Consider changing date interval' % count)
     return records
Ejemplo n.º 5
0
 def get_html(self, *args, **kwargs):
     """Fetch HTML via the superclass, retrying transient backend errors.

     If the fetched page reports a PostgreSQL connection failure, wait
     3 seconds and retry, up to MAX_TRIES extra attempts. Raises
     ScraperBroken if the error persists.
     """
     MAX_TRIES = 4
     tries = 0
     while tries <= MAX_TRIES:
         html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
         if 'Unable to connect to PostgreSQL server' in html:
             self.logger.debug('Got "Unable to connect to PostgreSQL" error')
             # Bug fix: `tries` was never incremented, so a persistent
             # backend error caused an infinite retry loop.
             tries += 1
             time.sleep(3)
             continue
         return html
     raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
Ejemplo n.º 6
0
 def get_html(self, *args, **kwargs):
     """Fetch HTML via the superclass, retrying transient backend errors.

     If the fetched page reports a PostgreSQL connection failure, wait
     3 seconds and retry, up to MAX_TRIES extra attempts. Raises
     ScraperBroken if the error persists.
     """
     MAX_TRIES = 4
     tries = 0
     while tries <= MAX_TRIES:
         html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
         if 'Unable to connect to PostgreSQL server' in html:
             self.logger.debug('Got "Unable to connect to PostgreSQL" error')
             # Bug fix: `tries` was never incremented, so a persistent
             # backend error caused an infinite retry loop.
             tries += 1
             time.sleep(3)
             continue
         return html
     raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
Ejemplo n.º 7
0
    def parse_list(self, record_html):
        """Parse a detail page, folding in the list_record gathered earlier.

        Normally this method gets passed raw html, but list_pages() hands
        us a (list_record, html) pair. A better version of the restaurant
        address is available on this page, so attempt to extract additional
        location details to resolve ambiguities.
        """
        list_record, html = record_html
        try:
            info = self.detail_address_re.search(html).groupdict()
            list_record['zipcode'] = info['zipcode']
        # Bug fix: the bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt. AttributeError covers a failed search()
        # (None.groupdict()); KeyError covers a missing 'zipcode' group.
        except (AttributeError, KeyError):
            self.logger.info("Could not get detailed address information for record %s: %s" % (list_record['restaurant_id'], list_record['restaurant_name']))

        for record in NewsItemListDetailScraper.parse_list(self, html):
            yield dict(list_record, **record)
Ejemplo n.º 8
0
    def parse_list(self, record_html):
        """Parse a detail page, folding in the list_record gathered earlier.

        Normally this method gets passed raw html, but list_pages() hands
        us a (list_record, html) pair. A better version of the restaurant
        address is available on this page, so attempt to extract additional
        location details to resolve ambiguities.
        """
        list_record, html = record_html
        try:
            info = self.detail_address_re.search(html).groupdict()
            list_record['zipcode'] = info['zipcode']
        # Bug fix: the bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt. AttributeError covers a failed search()
        # (None.groupdict()); KeyError covers a missing 'zipcode' group.
        except (AttributeError, KeyError):
            self.logger.info(
                "Could not get detailed address information for record %s: %s"
                %
                (list_record['restaurant_id'], list_record['restaurant_name']))

        for record in NewsItemListDetailScraper.parse_list(self, html):
            yield dict(list_record, **record)
Ejemplo n.º 9
0
 def __init__(self, filename=None):
     """Store an optional local filename and initialize the base scraper.

     filename, if given, presumably points at a local copy of the
     Woodstock data to parse instead of fetching it -- confirm against
     whichever method reads self.woodstock_filename.
     """
     self.woodstock_filename = filename
     NewsItemListDetailScraper.__init__(self)
Ejemplo n.º 10
0
 def parse_list(self, record_html):
     """record_html is a (list_record, html) pair; merge list_record
     into every record the superclass parses out of the html."""
     list_record, html = record_html
     detail_records = NewsItemListDetailScraper.parse_list(self, html)
     for detail in detail_records:
         yield dict(list_record, **detail)
Ejemplo n.º 11
0
 def __init__(self, filename=None):
     """Store an optional local filename and initialize the base scraper.

     filename, if given, presumably points at a local copy of the
     Woodstock data to parse instead of fetching it -- confirm against
     whichever method reads self.woodstock_filename.
     """
     self.woodstock_filename = filename
     NewsItemListDetailScraper.__init__(self)
Ejemplo n.º 12
0
 def parse_list(self, page):
     """Delegate parsing to the superclass, raising ScraperBroken when a
     page comes back full (99+ records), since that suggests results
     were truncated and the date interval should be narrowed."""
     records = list(NewsItemListDetailScraper.parse_list(self, page))
     count = len(records)
     self.logger.debug('Got %s records', count)
     if count >= 99:
         raise ScraperBroken('Got %s records. Consider changing date interval' % count)
     return records
Ejemplo n.º 13
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper with a date range, defaulting to the
     31 days ending today."""
     NewsItemListDetailScraper.__init__(self)
     today = datetime.date.today()
     month_back = today - datetime.timedelta(days=31)
     self.start_date = start_date or month_back
     self.end_date = end_date or today
Ejemplo n.º 14
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper with caching disabled, defaulting to the
     trailing five days when no start_date is supplied."""
     NewsItemListDetailScraper.__init__(self, use_cache=False)
     if start_date is None:
         today = datetime.date.today()
         start_date = today - datetime.timedelta(days=5)
         end_date = today
     self.start_date = start_date
     self.end_date = end_date
Ejemplo n.º 15
0
 def parse_detail(self, page, list_record):
     """Normalize the page before delegating to the superclass.

     The source uses a ton of &nbsp;s for some reason, so they are
     converted to spaces to keep the parse_detail_re regex readable.
     """
     cleaned = page.replace('&nbsp;', ' ')
     return NewsItemListDetailScraper.parse_detail(self, cleaned, list_record)
Ejemplo n.º 16
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper (caching off); when start_date is not
     given, fall back to the window from five days ago through today."""
     NewsItemListDetailScraper.__init__(self, use_cache=False)
     if start_date is None:
         today = datetime.date.today()
         start_date = today - datetime.timedelta(days=5)
         end_date = today
     self.start_date = start_date
     self.end_date = end_date
Ejemplo n.º 17
0
 def parse_list(self, page):
     """page is a (facility_type, html) pair; tag every record the
     superclass parses out of the html with its facility_type."""
     facility_type, html = page
     parsed = NewsItemListDetailScraper.parse_list(self, html)
     for rec in parsed:
         yield dict(rec, facility_type=facility_type)
Ejemplo n.º 18
0
 def parse_list(self, record_html):
     """Unpack the (list_record, html) pair and merge list_record into
     each detail record yielded by the superclass parser."""
     list_record, html = record_html
     detail_records = NewsItemListDetailScraper.parse_list(self, html)
     for detail in detail_records:
         yield dict(list_record, **detail)
Ejemplo n.º 19
0
 def __init__(self, start_date=None, end_date=None):
     """Set up the scraper's date range; unspecified endpoints default
     to a window covering the last 31 days through today."""
     NewsItemListDetailScraper.__init__(self)
     today = datetime.date.today()
     default_start = today - datetime.timedelta(days=31)
     self.start_date = start_date or default_start
     self.end_date = end_date or today
Ejemplo n.º 20
0
 def parse_list(self, page):
     """Expect page as a (facility_type, html) pair and annotate each
     record parsed by the superclass with that facility_type."""
     facility_type, html = page
     parsed = NewsItemListDetailScraper.parse_list(self, html)
     for rec in parsed:
         yield dict(rec, facility_type=facility_type)
Ejemplo n.º 21
0
 def __init__(self, hours=8, *args, **kwargs):
     """Store `hours` as self.num_hours (default 8) -- presumably the
     size of the time window this scraper covers; confirm against the
     code that reads num_hours -- and pass any remaining arguments
     through to the base scraper's constructor."""
     self.num_hours = hours
     NewsItemListDetailScraper.__init__(self, *args, **kwargs)