Example #1
    def __init__(self,
                 output_file=None,
                 max_links=100,
                 max_depth=1,
                 allowed_urls='',
                 banned_urls='^$',
                 robots=None,
                 crawl_existing=True):
        """
        `output_file` is where to save scraped data
        `max_links` is the maximum number of links to follow per page
        `max_depth` is the maximum depth to follow links into the website (use None for no limit)
        `allowed_urls` is a regex for allowed URLs; defaults to all URLs
        `banned_urls` is a regex for banned URLs; defaults to no URLs
        `robots` is a RobotFileParser object used to determine which URLs may be crawled
        `crawl_existing` sets whether to crawl content that was already downloaded to the cache
        """
        if output_file:
            self.writer = common.UnicodeWriter(output_file)
        else:
            self.writer = None
        self.max_links = max_links
        self.max_depth = max_depth
        self.allowed_urls = re.compile(allowed_urls)
        self.banned_urls = re.compile(banned_urls)
        self.robots = robots
        self.crawl_existing = crawl_existing
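
The defaults above are worth unpacking: an empty `allowed_urls` pattern matches every URL, while `banned_urls='^$'` matches only the empty string, so no URL is banned. A minimal standalone sketch of that filtering logic, not the library's own code (the `can_crawl` helper and the example URLs are illustrative):

import re
from urllib import robotparser

allowed_urls = re.compile('')    # empty pattern matches every URL, so all URLs are allowed
banned_urls = re.compile('^$')   # matches only the empty string, so no URL is banned

robots = robotparser.RobotFileParser('http://example.com/robots.txt')
robots.read()  # download and parse robots.txt

def can_crawl(url, robots=None):
    # the URL must match the allowed pattern, avoid the banned pattern,
    # and (if a parser is given) be permitted by robots.txt
    if not allowed_urls.search(url):
        return False
    if banned_urls.search(url):
        return False
    return robots is None or robots.can_fetch('*', url)

print(can_crawl('http://example.com/page.html', robots))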
Example #2
    def __init__(self,
                 output_file=None,
                 max_links=100,
                 max_depth=1,
                 allowed_urls='',
                 banned_urls='^$',
                 robots=None,
                 crawl_existing=True):
        """
        output_file:
            where to save scraped data
        max_links:
            the maximum number of links to follow per page
        max_depth:
            the maximum depth to follow links into the website (use None for no limit)
        allowed_urls:
            a regex for allowed URLs; defaults to all URLs
        banned_urls:
            a regex for banned URLs; defaults to no URLs
        robots:
            RobotFileParser object used to determine which URLs may be crawled
        crawl_existing:
            sets whether to crawl content that was already downloaded to the cache
        """
        self.found = adt.HashDict(int)  # track depth of found URLs
        if output_file:
            self.writer = common.UnicodeWriter(output_file)
        else:
            self.writer = None
        self.max_links = max_links
        self.max_depth = max_depth
        self.allowed_urls = re.compile(allowed_urls)
        self.banned_urls = re.compile(banned_urls)
        self.robots = robots
        self.crawl_existing = crawl_existing
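
The only difference from Example #1 is `self.found`, a mapping from URL to the depth at which it was found. Assuming `adt.HashDict(int)` behaves like a dict that defaults missing keys to 0, the depth bookkeeping could look roughly like the sketch below (collections.defaultdict stands in for HashDict; `should_follow` is an illustrative name, not part of the library):

from collections import defaultdict

found = defaultdict(int)  # URL -> depth at which it was first seen
max_depth = 1

def should_follow(url, parent_url=None):
    # skip URLs that have already been recorded
    if url in found:
        return False
    # a URL found on a parent page sits one level deeper than its parent
    depth = found[parent_url] + 1 if parent_url in found else 0
    found[url] = depth
    return max_depth is None or depth <= max_depth

should_follow('http://example.com/')                           # depth 0 -> True
should_follow('http://example.com/a', 'http://example.com/')   # depth 1 -> True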
Example #3
    def __init__(self, output_file, header):
        # load state saved by a previous run, if it exists
        self.load_state()
        # default settings start the crawl from the beginning
        self.new_urls = False
        write_header = True
        mode = 'wb'
        if StateCallback.active_urls:
            # a previous crawl did not finish, so resume it
            common.logger.info('Loading previous crawl state')
            self.new_urls = True
            if os.path.exists(output_file):
                # append to the existing output instead of overwriting it
                mode = 'ab'
                write_header = False

        self.writer = common.UnicodeWriter(output_file, mode=mode)
        if write_header:
            self.writer.writerow(header)
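
In plain-csv terms (without the library's common.UnicodeWriter), the resume logic above amounts to: append without re-writing the header when an interrupted crawl left an output file behind, otherwise start a fresh file and write the header once. A rough standalone sketch, with `open_writer` and the file/header names purely illustrative:

import csv
import os

def open_writer(output_file, header, resuming):
    # append to an existing file when resuming, otherwise start fresh
    mode = 'a' if resuming and os.path.exists(output_file) else 'w'
    fp = open(output_file, mode, newline='', encoding='utf-8')
    writer = csv.writer(fp)
    if mode == 'w':
        writer.writerow(header)  # header only for a brand-new file
    return writer

writer = open_writer('results.csv', ['url', 'title'], resuming=False)
writer.writerow(['http://example.com/', 'Example Domain'])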