Ejemplo n.º 1
0
 def __init__(self, name_start=''):
     """Initialize the scraper, optionally resuming mid-alphabet.

     name_start, if given, should be a string of the first restaurant
     name to start scraping, alphabetically. This is useful if you've
     run the scraper and it's broken several hours into it -- you can
     pick up around where it left off.
     """
     NewsItemListDetailScraper.__init__(self)
     self.name_start = name_start.lower()
Ejemplo n.º 2
0
 def __init__(self, name_start=""):
     """Set up the scraper, resuming alphabetically if asked.

     name_start, if given, should be a string of the first restaurant
     name to start scraping, alphabetically. Useful when a previous run
     broke several hours in -- you can pick up around where it left off.
     """
     NewsItemListDetailScraper.__init__(self)
     self.name_start = name_start.lower()
Ejemplo n.º 3
0
 def __init__(self, mdb_filename=None):
     """Set up the scraper, optionally pointed at a local MDB file.

     If mdb_filename is given, it should be the name of an MDB file on
     the local filesystem to import. Otherwise, this will try to find
     the latest one available online.
     """
     NewsItemListDetailScraper.__init__(self)
     self._local_mdb_filename = mdb_filename
     self._mdb_filename = None
     # Lazily-populated lookup caches; None means "not loaded yet".
     self._locations_cache = None
     self._inspection_type_cache = None
     self._violations_cache = None
     self._violation_type_cache = None
Ejemplo n.º 4
0
 def parse_list(self, page):
     """Parse via the superclass, but treat a full result page (99+
     records) as a sign the date interval needs to be narrowed."""
     records = list(NewsItemListDetailScraper.parse_list(self, page))
     count = len(records)
     self.logger.debug('Got %s records', count)
     if count >= 99:
         raise ScraperBroken(
             'Got %s records. Consider changing date interval' % count)
     return records
Ejemplo n.º 5
0
 def get_html(self, *args, **kwargs):
     """Fetch HTML via the superclass, retrying transient backend errors.

     If the fetched page reports a PostgreSQL connection failure, wait
     3 seconds and retry, up to MAX_TRIES extra attempts. Raises
     ScraperBroken if the error persists.
     """
     MAX_TRIES = 4
     tries = 0
     while tries <= MAX_TRIES:
         html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
         if 'Unable to connect to PostgreSQL server' in html:
             self.logger.debug('Got "Unable to connect to PostgreSQL" error')
             # Bug fix: `tries` was never incremented, so a persistent
             # backend error caused an infinite retry loop.
             tries += 1
             time.sleep(3)
             continue
         return html
     raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
Ejemplo n.º 6
0
 def get_html(self, *args, **kwargs):
     """Fetch HTML via the superclass, retrying transient backend errors.

     If the fetched page reports a PostgreSQL connection failure, wait
     3 seconds and retry, up to MAX_TRIES extra attempts. Raises
     ScraperBroken if the error persists.
     """
     MAX_TRIES = 4
     tries = 0
     while tries <= MAX_TRIES:
         html = NewsItemListDetailScraper.get_html(self, *args, **kwargs)
         if 'Unable to connect to PostgreSQL server' in html:
             self.logger.debug('Got "Unable to connect to PostgreSQL" error')
             # Bug fix: `tries` was never incremented, so a persistent
             # backend error caused an infinite retry loop.
             tries += 1
             time.sleep(3)
             continue
         return html
     raise ScraperBroken('Got PostgreSQL error %s times' % MAX_TRIES)
Ejemplo n.º 7
0
    def parse_list(self, record_html):
        """Parse a detail page, folding in the list_record gathered earlier.

        Normally this method gets passed raw html, but list_pages() hands
        us a (list_record, html) pair. A better version of the restaurant
        address is available on this page, so attempt to extract additional
        location details to resolve ambiguities.
        """
        list_record, html = record_html
        try:
            info = self.detail_address_re.search(html).groupdict()
            list_record['zipcode'] = info['zipcode']
        # Bug fix: the bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt. AttributeError covers a failed search()
        # (None.groupdict()); KeyError covers a missing 'zipcode' group.
        except (AttributeError, KeyError):
            self.logger.info("Could not get detailed address information for record %s: %s" % (list_record['restaurant_id'], list_record['restaurant_name']))

        for record in NewsItemListDetailScraper.parse_list(self, html):
            yield dict(list_record, **record)
Ejemplo n.º 8
0
    def parse_list(self, record_html):
        """Parse a detail page, folding in the list_record gathered earlier.

        Normally this method gets passed raw html, but list_pages() hands
        us a (list_record, html) pair. A better version of the restaurant
        address is available on this page, so attempt to extract additional
        location details to resolve ambiguities.
        """
        list_record, html = record_html
        try:
            info = self.detail_address_re.search(html).groupdict()
            list_record['zipcode'] = info['zipcode']
        # Bug fix: the bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt. AttributeError covers a failed search()
        # (None.groupdict()); KeyError covers a missing 'zipcode' group.
        except (AttributeError, KeyError):
            self.logger.info(
                "Could not get detailed address information for record %s: %s"
                %
                (list_record['restaurant_id'], list_record['restaurant_name']))

        for record in NewsItemListDetailScraper.parse_list(self, html):
            yield dict(list_record, **record)
Ejemplo n.º 9
0
 def __init__(self, filename=None):
     """Store an optional local filename and initialize the base scraper.

     filename, if given, presumably points at a local copy of the
     Woodstock data to parse instead of fetching it -- confirm against
     whichever method reads self.woodstock_filename.
     """
     self.woodstock_filename = filename
     NewsItemListDetailScraper.__init__(self)
Ejemplo n.º 10
0
 def parse_list(self, record_html):
     """record_html is a (list_record, html) pair; merge list_record
     into every record the superclass parses out of the html."""
     list_record, html = record_html
     detail_records = NewsItemListDetailScraper.parse_list(self, html)
     for detail in detail_records:
         yield dict(list_record, **detail)
Ejemplo n.º 11
0
 def __init__(self, filename=None):
     """Store an optional local filename and initialize the base scraper.

     filename, if given, presumably points at a local copy of the
     Woodstock data to parse instead of fetching it -- confirm against
     whichever method reads self.woodstock_filename.
     """
     self.woodstock_filename = filename
     NewsItemListDetailScraper.__init__(self)
Ejemplo n.º 12
0
 def parse_list(self, page):
     """Delegate parsing to the superclass, raising ScraperBroken when a
     page comes back full (99+ records), since that suggests results
     were truncated and the date interval should be narrowed."""
     records = list(NewsItemListDetailScraper.parse_list(self, page))
     count = len(records)
     self.logger.debug('Got %s records', count)
     if count >= 99:
         raise ScraperBroken('Got %s records. Consider changing date interval' % count)
     return records
Ejemplo n.º 13
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper with a date range, defaulting to the
     31 days ending today."""
     NewsItemListDetailScraper.__init__(self)
     today = datetime.date.today()
     month_back = today - datetime.timedelta(days=31)
     self.start_date = start_date or month_back
     self.end_date = end_date or today
Ejemplo n.º 14
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper with caching disabled, defaulting to the
     trailing five days when no start_date is supplied."""
     NewsItemListDetailScraper.__init__(self, use_cache=False)
     if start_date is None:
         today = datetime.date.today()
         start_date = today - datetime.timedelta(days=5)
         end_date = today
     self.start_date = start_date
     self.end_date = end_date
Ejemplo n.º 15
0
 def parse_detail(self, page, list_record):
     """Normalize the page before delegating to the superclass.

     The source uses a ton of &nbsp;s for some reason, so they are
     converted to spaces to keep the parse_detail_re regex readable.
     """
     cleaned = page.replace('&nbsp;', ' ')
     return NewsItemListDetailScraper.parse_detail(self, cleaned, list_record)
Ejemplo n.º 16
0
 def __init__(self, start_date=None, end_date=None):
     """Initialize the scraper (caching off); when start_date is not
     given, fall back to the window from five days ago through today."""
     NewsItemListDetailScraper.__init__(self, use_cache=False)
     if start_date is None:
         today = datetime.date.today()
         start_date = today - datetime.timedelta(days=5)
         end_date = today
     self.start_date = start_date
     self.end_date = end_date
Ejemplo n.º 17
0
 def parse_list(self, page):
     """page is a (facility_type, html) pair; tag every record the
     superclass parses out of the html with its facility_type."""
     facility_type, html = page
     parsed = NewsItemListDetailScraper.parse_list(self, html)
     for rec in parsed:
         yield dict(rec, facility_type=facility_type)
Ejemplo n.º 18
0
 def parse_list(self, record_html):
     """Unpack the (list_record, html) pair and merge list_record into
     each detail record yielded by the superclass parser."""
     list_record, html = record_html
     detail_records = NewsItemListDetailScraper.parse_list(self, html)
     for detail in detail_records:
         yield dict(list_record, **detail)
Ejemplo n.º 19
0
 def __init__(self, start_date=None, end_date=None):
     """Set up the scraper's date range; unspecified endpoints default
     to a window covering the last 31 days through today."""
     NewsItemListDetailScraper.__init__(self)
     today = datetime.date.today()
     default_start = today - datetime.timedelta(days=31)
     self.start_date = start_date or default_start
     self.end_date = end_date or today
Ejemplo n.º 20
0
 def parse_list(self, page):
     """Expect page as a (facility_type, html) pair and annotate each
     record parsed by the superclass with that facility_type."""
     facility_type, html = page
     parsed = NewsItemListDetailScraper.parse_list(self, html)
     for rec in parsed:
         yield dict(rec, facility_type=facility_type)
Ejemplo n.º 21
0
 def __init__(self, hours=8, *args, **kwargs):
     """Store `hours` as self.num_hours (default 8) -- presumably the
     size of the time window this scraper covers; confirm against the
     code that reads num_hours -- and pass any remaining arguments
     through to the base scraper's constructor."""
     self.num_hours = hours
     NewsItemListDetailScraper.__init__(self, *args, **kwargs)