def __init__(self, name_start=''):
    """
    Initialize the scraper.

    name_start, if provided, is the name of the first restaurant
    (alphabetically) at which scraping should begin; it is stored
    lowercased. Handy for resuming a multi-hour run that broke
    partway through -- pick up roughly where it left off.
    """
    NewsItemListDetailScraper.__init__(self)
    self.name_start = name_start.lower()
def __init__(self, name_start=""):
    """
    Set up the scraper, optionally resuming at a given restaurant.

    When name_start is given it should be the first restaurant name,
    alphabetically, to scrape (stored lowercased). Useful after the
    scraper has broken several hours in: restart near where it died.
    """
    NewsItemListDetailScraper.__init__(self)
    self.name_start = name_start.lower()
def __init__(self, mdb_filename=None):
    """
    Set up the scraper.

    mdb_filename, when given, names an MDB file on the local
    filesystem to import; otherwise the latest available file will be
    located online. All table caches start out unset.
    """
    NewsItemListDetailScraper.__init__(self)
    self._local_mdb_filename = mdb_filename
    self._mdb_filename = None
    # Lazily-populated caches for the MDB tables.
    self._locations_cache = None
    self._inspection_type_cache = None
    self._violations_cache = None
    self._violation_type_cache = None
def parse_list(self, page):
    """
    Parse the list page, sanity-checking the record count.

    Raises ScraperBroken on 99+ records, which suggests the source
    truncated the result set and the date interval should be narrowed.
    """
    records = list(NewsItemListDetailScraper.parse_list(self, page))
    num_records = len(records)
    self.logger.debug('Got %s records', num_records)
    if num_records >= 99:
        raise ScraperBroken(
            'Got %s records. Consider changing date interval' % num_records)
    return records
def get_html(self, *args, **kwargs):
    """
    Fetch HTML, retrying on transient PostgreSQL connection errors.

    Retries up to MAX_TRIES times, sleeping 3 seconds between
    attempts. Raises ScraperBroken if the error persists.
    """
    MAX_TRIES = 4
    tries = 0
    while tries <= MAX_TRIES:
        html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
        if 'Unable to connect to PostgreSQL server' in html:
            self.logger.debug('Got "Unable to connect to PostgreSQL" error')
            # BUG FIX: tries was never incremented, so a persistent
            # error caused an infinite retry loop instead of the
            # intended bounded retry.
            tries += 1
            time.sleep(3)
            continue
        return html
    raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
def get_html(self, *args, **kwargs):
    """
    Fetch HTML via the parent class, retrying on transient
    'Unable to connect to PostgreSQL server' responses.

    Sleeps 3 seconds between attempts and raises ScraperBroken after
    the retry budget (MAX_TRIES) is exhausted.
    """
    MAX_TRIES = 4
    tries = 0
    while tries <= MAX_TRIES:
        html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
        if 'Unable to connect to PostgreSQL server' in html:
            self.logger.debug('Got "Unable to connect to PostgreSQL" error')
            # BUG FIX: the counter was never advanced, so the loop
            # could spin forever on a persistent database outage.
            tries += 1
            time.sleep(3)
            continue
        return html
    raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
def parse_list(self, record_html):
    """
    Parse a detail page, folding list-record fields into each result.

    record_html is a (list_record, html) pair as produced by
    list_pages(), rather than the raw HTML this method normally gets.
    A better version of the restaurant address is available on this
    page, so attempt to extract the ZIP code to resolve geocoding
    ambiguities; failure to find it is logged and non-fatal.
    """
    list_record, html = record_html
    try:
        # search() returns None on no match (-> AttributeError), and
        # groupdict() access raises KeyError if the pattern has no
        # 'zipcode' group.
        info = self.detail_address_re.search(html).groupdict()
        list_record['zipcode'] = info['zipcode']
    except (AttributeError, KeyError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and masked unrelated bugs.
        self.logger.info("Could not get detailed address information for record %s: %s" % (list_record['restaurant_id'], list_record['restaurant_name']))
    for record in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **record)
def parse_list(self, record_html):
    """
    Parse a detail page, merging in fields from the list record.

    Normally this method gets passed raw HTML, but list_pages() hands
    us a (list_record, html) pair. The detail page carries a more
    precise restaurant address, so we try to pull the ZIP code from it
    to resolve location ambiguities; a miss is logged and ignored.
    """
    list_record, html = record_html
    try:
        # search() yields None when the address pattern misses
        # (-> AttributeError); a missing 'zipcode' key -> KeyError.
        info = self.detail_address_re.search(html).groupdict()
        list_record['zipcode'] = info['zipcode']
    except (AttributeError, KeyError):
        # BUG FIX: replaced a bare `except:` that swallowed every
        # exception type, including KeyboardInterrupt.
        self.logger.info(
            "Could not get detailed address information for record %s: %s" %
            (list_record['restaurant_id'], list_record['restaurant_name']))
    for record in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **record)
def __init__(self, filename=None):
    """Remember the optional local data file, then set up the scraper."""
    self.woodstock_filename = filename
    NewsItemListDetailScraper.__init__(self)
def parse_list(self, record_html):
    """
    Yield parent-parsed records with the list record's fields merged in.

    record_html is a (list_record, html) pair; fields parsed from the
    HTML take precedence over those already in list_record.
    """
    list_record, html = record_html
    for parsed in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(list_record, **parsed)
def __init__(self, filename=None):
    """Store the optional Woodstock filename and initialize the parent."""
    self.woodstock_filename = filename
    NewsItemListDetailScraper.__init__(self)
def parse_list(self, page):
    """
    Delegate list parsing to the parent, guarding against truncation.

    A count of 99 or more records indicates the source capped the
    result set; raise ScraperBroken so the date interval gets changed.
    """
    records = list(NewsItemListDetailScraper.parse_list(self, page))
    count = len(records)
    self.logger.debug('Got %s records', count)
    if count >= 99:
        message = 'Got %s records. Consider changing date interval' % count
        raise ScraperBroken(message)
    return records
def __init__(self, start_date=None, end_date=None):
    """
    Set up the scraper's date window.

    Defaults: start_date falls back to 31 days ago, end_date to today.
    """
    NewsItemListDetailScraper.__init__(self)
    today = datetime.date.today()
    if not start_date:
        start_date = today - datetime.timedelta(days=31)
    if not end_date:
        end_date = today
    self.start_date = start_date
    self.end_date = end_date
def __init__(self, start_date=None, end_date=None):
    """
    Set up the scraper with caching disabled.

    When start_date is omitted, both endpoints default to the five-day
    window ending today (any passed end_date is ignored in that case).
    """
    NewsItemListDetailScraper.__init__(self, use_cache=False)
    if start_date is None:
        today = datetime.date.today()
        end_date = today
        start_date = today - datetime.timedelta(days=5)
    self.start_date = start_date
    self.end_date = end_date
def parse_detail(self, page, list_record):
    """
    Normalize whitespace on the detail page, then parse it.

    The pages use a ton of &nbsp;s for some reason, so convert them to
    plain spaces to keep the parse_detail_re regex readable.
    """
    # BUG FIX: this previously read page.replace(' ', ' ') -- a no-op,
    # apparently from the '&nbsp;' literal being mangled at some point.
    page = page.replace('&nbsp;', ' ')
    return NewsItemListDetailScraper.parse_detail(self, page, list_record)
def __init__(self, start_date=None, end_date=None):
    """
    Initialize a non-caching scraper over the given date range.

    Absent a start_date, scrape the last five days: end_date is forced
    to today and start_date to five days before it.
    """
    NewsItemListDetailScraper.__init__(self, use_cache=False)
    if start_date is None:
        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=5)
        end_date = today
    self.start_date = start_date
    self.end_date = end_date
def parse_list(self, page):
    """
    page is a (facility_type, html) pair; tag every record parsed from
    the HTML with its facility type.
    """
    facility_type, html = page
    for rec in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(rec, facility_type=facility_type)
def parse_list(self, record_html):
    """
    Unpack the (list_record, html) pair and yield each record the
    parent parses from the HTML, combined with list_record's fields
    (parsed fields win on key collisions).
    """
    base_record, html = record_html
    for extra in NewsItemListDetailScraper.parse_list(self, html):
        yield dict(base_record, **extra)
def __init__(self, start_date=None, end_date=None):
    """
    Initialize the scraper, defaulting the window to the past month:
    start_date -> 31 days ago, end_date -> today.
    """
    NewsItemListDetailScraper.__init__(self)
    today = datetime.date.today()
    month_ago = today - datetime.timedelta(days=31)
    self.start_date = start_date if start_date else month_ago
    self.end_date = end_date if end_date else today
def parse_list(self, page):
    """
    Parse records out of the HTML half of the (facility_type, html)
    pair, annotating each with the facility type.
    """
    facility_type, html = page
    parent_records = NewsItemListDetailScraper.parse_list(self, html)
    for record in parent_records:
        tagged = dict(record)
        tagged['facility_type'] = facility_type
        yield tagged
def __init__(self, hours=8, *args, **kwargs):
    """Record how many trailing hours to scrape (default 8), then
    initialize the parent with any remaining arguments."""
    self.num_hours = hours
    NewsItemListDetailScraper.__init__(self, *args, **kwargs)