Ejemplo n.º 1
0
 def __init__(self, start_url, sitemap_url=None):
     """Initialise the URL-tracking state, seeded from ``start_url``.

     Args:
         start_url: URL used to derive the base domain and base site, and
             to build the first entry of ``non_visited_urls``.
         sitemap_url: Optional sitemap location. Any falsy value (the
             default ``None`` included) selects ``<base_site>/sitemap.xml``.
     """
     self.logger = logging.getLogger(__name__)

     # Both values are derived from the start URL; base_domain is also
     # handed to the page factory below, so it has to be computed first.
     self.base_domain = extract_domain(start_url)
     self.base_site = extract_base_site(start_url)

     # The start URL is passed twice (first and third argument), with None
     # in between, exactly as the page factory is called for the seed page.
     seed_page = _get_client_page(
         start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)

     # URL bookkeeping: only the seed page is pending at construction time.
     self.non_visited_urls = {seed_page}
     self.visited_urls = set()
     self.intermediate_urls = set()

     # Counters and flags; added_count starts at 1 to match the single
     # seeded entry above.
     self.added_count = 1
     self.idle_ping = 0
     self.start_idle_counter = False
     # NOTE: a task.Cooperator() assignment to self.coop used to live here
     # and is currently disabled.

     # An explicitly supplied sitemap URL wins over the conventional one.
     self.sitemap_url = sitemap_url or '{}/sitemap.xml'.format(self.base_site)
Ejemplo n.º 2
0
    def __init__(self, start_url, sitemap_url=None, max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
        """Initialise the URL-tracking state, queue and connection limit.

        Args:
            start_url: URL used to derive the base domain and base site, and
                to build ``base_page`` (the first pending entry).
            sitemap_url: Optional sitemap location. Any falsy value (the
                default ``None`` included) selects
                ``<base_site>/sitemap.xml``.
            max_concurrent_connections: Value stored on the instance and
                used as the size of the bounded semaphore.
        """
        # Both values are derived from the start URL; base_domain is also
        # handed to the page factory below, so it has to be computed first.
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)

        # The seed page is kept both as an attribute and as the sole member
        # of the pending set.
        seed_page = _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)
        self.base_page = seed_page
        self.non_visited_urls = {seed_page}
        self.visited_urls = set()
        self.intermediate_urls = set()

        # Counters and flags; added_count starts at 1 to match the single
        # seeded entry above.
        self.added_count = 1
        self.skip_count = 0
        self.idle_ping = 0
        self.start_idle_counter = False

        # An explicitly supplied sitemap URL wins over the conventional one.
        if sitemap_url:
            self.sitemap_url = sitemap_url
        else:
            self.sitemap_url = u'{}/sitemap.xml'.format(self.base_site)

        # Queue plus a semaphore sized by the configured connection limit.
        self.max_concurrent_connections = max_concurrent_connections
        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)

        # Wall-clock timestamp taken at construction.
        self.start = time.time()