Ejemplo n.º 1
0
 def __init__(self, url, depth, index):
     """Set up a depth-limited crawler seeded with a single URL.

     url   -- the only entry of the ``links`` list handed to Crawler.
     depth -- stored as ``_max_depth``.
     index -- stored as ``_index``; not used otherwise in this method.
     """
     Crawler.__init__(self, links=[url])
     # Character class: Russian and Latin letters plus the hyphen.
     # Fix: the uppercase Cyrillic run used to omit the letter "Ы", so
     # upper-case words containing it were split in two.
     # NOTE(review): the original triple-quoted literal also placed a
     # newline and indentation spaces inside the class; they are kept
     # here (spelled out as "\n ") so whitespace still matches as before.
     # Confirm whether matching whitespace is actually intended.
     re_text = (
         u"[абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
         u"АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
         u"\n "
         u"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-]+"
     )
     self._clear_text_re = re.compile(re_text)
     self._index = index
     # -1 until a page has been selected for crawling.
     self._current_depth = -1
     # Depth bookkeeping keyed by link; the value of self.next right after
     # construction (presumably the seed link -- verify) gets depth 0.
     self._depths = {self.next: 0}
     self._max_depth = depth
Ejemplo n.º 2
0
 def priority(self, link, method=DEPTH):
     """Return the queue priority for *link*.

     A link whose URL contains "?" gets priority 0.0; every other link
     is ranked by Crawler.priority using the given *method*.
     """
     has_querystring = "?" in link.url
     if not has_querystring:
         # No querystring: defer to the default ranker.
         return Crawler.priority(self, link, method)
     return 0.0
Ejemplo n.º 3
0
 def __init__(self,
              links=[],
              domains=[],
              delay=20.0,
              parse=HTMLLinkParser().parse,
              sort=FIFO):
     """Initialise the crawler and remember the crawl's root URL.

     All arguments are forwarded unchanged to Crawler.__init__.
     The first element of *links* becomes ``root_url``, so *links* must
     be non-empty: indexing an empty list raises IndexError.
     """
     Crawler.__init__(self, links, domains, delay, parse, sort)
     # Keep the first link as the root of this crawl.
     self.root_url = links[0]
     # save_crawl is defined elsewhere; its return value is kept as the
     # crawl id (presumably a persisted record id -- verify).
     self.crawl_id = save_crawl(self.root_url)
     # The patterns are raw strings so the regex escapes are not treated
     # as (invalid) string escapes; the resulting values are unchanged.
     #
     # Matches the end of rank urls like
     # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/ph140120.htm
     self.target_pattern = r'p\w\d{6}\.htm'
     # Matches the end of day urls like
     # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/
     self.day_pattern = r'[a-z]{4,9}/\Z'
     # Matches urls ending in a four-digit year followed by "palaered/".
     self.year_pattern = r'[0-9]{4}palaered/\Z'
Ejemplo n.º 4
0
 def crawl(self, method=BREADTH, **kwargs):
     """Crawl the next queued link, tracking its depth.

     Returns False when self.next is falsy or when Crawler.crawl raises
     (the error is printed, not propagated); otherwise returns whatever
     Crawler.crawl returns.
     """
     upcoming = self.next
     if not upcoming:
         return False

     # Record the depth of the page about to be visited; push() reads it.
     self._current_depth = self._depths[upcoming]
     progress = (len(self.visited), self._current_depth)
     print('Crawling %dth page at depth %d' % progress)

     try:
         outcome = Crawler.crawl(self, method, **kwargs)
     except Exception as e:
         print('Ошибка при построении индекса: %s' % e)
         return False
     return outcome
Ejemplo n.º 5
0
 def push(self, link, priority=1.0, sort=FILO):
     """Queue *link* unless doing so would reach the maximum depth.

     Pushes issued while an ``__init__`` is two frames up the call stack
     are always forwarded to Crawler.push. Any other push is forwarded
     only if the link's depth (current depth + 1) is below ``_max_depth``;
     in that case the depth is recorded in ``_depths`` first.
     """
     # Index 3 of a frame record is the function name. Two frames up is
     # presumably the constructor that queues the seed links -- verify.
     caller_name = inspect.stack()[2][3]
     if caller_name != '__init__':
         child_depth = self._current_depth + 1
         if not child_depth < self._max_depth:
             # Too deep: drop the link silently.
             return
         self._depths[link] = child_depth
     Crawler.push(self, link, priority, sort)
Ejemplo n.º 6
0
 def priority(self, link, method=None):
     """Return the queue priority for *link*.

     Links whose URL is flagged by ``self.badLink.detect`` get a fixed
     priority of 0.1; all others are ranked by Crawler.priority.
     """
     flagged = self.badLink.detect(link.url)
     if not flagged:
         return Crawler.priority(self, link, method)
     return 0.1