Ejemplos de Crawler en Python

Lenguaje de programación: Python

Namespace/Package Name: pattern.web

Clase / Tipo: Crawler

Ejemplos en hotexamples.com: 6

Python Crawler - 6 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de pattern.web.Crawler extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

__init__(1)

crawl(1)

priority(1)

push(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: crawler.py Proyecto: nkartashov/NKrawler

 def __init__(self, url, depth, index):
     """Set up a crawler that starts at ``url`` and tracks link depth.

     Args:
         url: Start link; passed to ``Crawler.__init__`` as the only
             entry of ``links``.
         depth: Stored as ``_max_depth``.
         index: Stored as ``_index``; not otherwise used in this method.
     """
     Crawler.__init__(self, links=[url])
     # re.compile is called without re.VERBOSE, so the line break and the
     # indentation inside this literal belong to the character class: the
     # pattern matches newlines and spaces as well as letters and "-".
     # NOTE(review): confirm that matching whitespace is intended.
     # The uppercase run includes "Ы" so that all-caps words containing it
     # are not split in two.
     re_text = u"""[абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
     ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-]+"""
     self._clear_text_re = re.compile(re_text)
     self._index = index
     # -1 until a page has been selected for crawling.
     self._current_depth = -1
     # Depth 0 is recorded for whatever ``self.next`` yields right after
     # construction -- presumably the start link; verify against Crawler.
     self._depths = {self.next: 0}
     self._max_depth = depth

Ejemplo n.º 2

Mostrar archivo

Archivo: 13-crawler.py Proyecto: ADA110/Cibus

 def priority(self, link, method=DEPTH):
     """Return the queue priority for ``link``.

     Links whose URL contains "?" get 0.0; every other link is ranked by
     ``Crawler.priority`` using the given ``method``.
     """
     has_query = "?" in link.url
     if not has_query:
         # Default ranker: the result depends on the DEPTH or BREADTH
         # crawl mode passed in ``method``.
         return Crawler.priority(self, link, method)
     # Query-string links are given the value 0.0 (per the original
     # author's note, this makes the crawler ignore them).
     return 0.0

Ejemplo n.º 3

Mostrar archivo

 def __init__(self,
              links=None,
              domains=None,
              delay=20.0,
              parse=HTMLLinkParser().parse,
              sort=FIFO):
     """Initialise the crawler and register the crawl for its first link.

     Args:
         links: Start links; the first one becomes ``root_url``. ``None``
             is treated as an empty list.
         domains: Forwarded to ``Crawler.__init__``. ``None`` is treated
             as an empty list.
         delay: Forwarded to ``Crawler.__init__``.
         parse: Forwarded to ``Crawler.__init__``.
         sort: Forwarded to ``Crawler.__init__``.

     Raises:
         IndexError: If ``links`` is empty (raised after
             ``Crawler.__init__`` has already run).
     """
     # Fresh lists per call: a literal ``[]`` default would be one object
     # shared by every instance constructed without these arguments.
     links = [] if links is None else links
     domains = [] if domains is None else domains
     # call super constructor
     Crawler.__init__(self, links, domains, delay, parse, sort)
     # save first link into root_url attribute
     self.root_url = links[0]
     self.crawl_id = save_crawl(self.root_url)
     # Raw strings keep the regex backslashes without relying on Python
     # passing unknown escapes such as "\w" or "\Z" through unchanged.
     # this will match on the end of rank urls like
     # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/ph140120.htm
     self.target_pattern = r'p\w\d{6}\.htm'
     # this will match on the end of day urls like
     # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/
     self.day_pattern = r'[a-z]{4,9}/\Z'
     self.year_pattern = r'[0-9]{4}palaered/\Z'

Ejemplo n.º 4

Mostrar archivo

Archivo: crawler.py Proyecto: nkartashov/NKrawler

 def crawl(self, method=BREADTH, **kwargs):
     """Crawl the next queued link, reporting progress and errors.

     Returns ``False`` when ``self.next`` is falsy or when
     ``Crawler.crawl`` raises; otherwise returns whatever
     ``Crawler.crawl`` returns.
     """
     pending = self.next
     if not pending:
         return False

     # Record the depth of the page about to be crawled. A link missing
     # from ``_depths`` raises KeyError here, outside the try block.
     self._current_depth = self._depths[pending]
     progress = 'Crawling %dth page at depth %d' % (len(self.visited), self._current_depth)
     print(progress)

     try:
         return Crawler.crawl(self, method, **kwargs)
     except Exception as err:
         # Best effort: the failure is printed and reported as False.
         print('Ошибка при построении индекса: %s' % err)
     return False

Ejemplo n.º 5

Mostrar archivo

Archivo: crawler.py Proyecto: nkartashov/NKrawler

 def push(self, link, priority=1.0, sort=FILO):
     """Queue ``link`` unless doing so would reach ``_max_depth``.

     Links pushed while an ``__init__`` is two frames up the call stack
     are queued unconditionally and get no ``_depths`` entry here. Any
     other link is queued only if ``_current_depth + 1`` is below
     ``_max_depth``, and that depth is recorded for it first.
     """
     # Index 3 of a stack record is the function name; record 2 is the
     # caller of this method's caller.
     # NOTE(review): this depends on the exact call chain inside Crawler.
     caller_name = inspect.stack()[2][3]
     if caller_name != '__init__':
         child_depth = self._current_depth + 1
         if not child_depth < self._max_depth:
             # Too deep: drop the link without queueing it.
             return
         self._depths[link] = child_depth
     Crawler.push(self, link, priority, sort)

Ejemplo n.º 6

Mostrar archivo

Archivo: crawler_07.py Proyecto: luislezcair/gisiaws

 def priority(self, link, method=None):
     """Return 0.1 for links flagged by ``self.badLink``, else the default.

     Links that are not flagged are ranked by ``Crawler.priority`` with
     the given ``method``.
     """
     # An earlier version tested link.url for the substrings "linkedin",
     # "twitter", "facebook" and "google" -- presumably superseded by
     # ``badLink.detect``; confirm against that detector.
     flagged = self.badLink.detect(link.url)
     if not flagged:
         return Crawler.priority(self, link, method)
     return 0.1