def request_uniq(self):
    if not self.is_data_loaded:
        self.load_data()

    # fill queue with only unique urls
    for i in self.get_unique_urls():
        self.queue.put({"host": self.host, "url": i})

    # add 'None' to queue - stops threads when no items are left
    for i in range(self.threads):
        self.queue.put(None)

    # start the threads
    for i in range(self.threads):
        w = RequesterThread(i, self.queue, self.cache, self.requested)
        w.daemon = True
        self.workers.append(w)
        w.start()

    # join when all work is done
    self.queue.join()

    # convert queue to list
    # pair items from self.get_all_items with the urls requested
    result_list = []
    all_items = self.get_all_items()

    while not self.requested.empty():
        url, response = self.requested.get()
        for item in all_items:
            if item['url'] == url:
                item["response"] = response
                result_list.append(item)

    return result_list
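
For context, here is a minimal sketch of what the worker side of this producer/consumer pattern could look like. The real RequesterThread class is not shown in this section, so the constructor parameters, the use of urllib.request, and the 5-second timeout below are assumptions; the only parts taken from the code above are the None sentinel, the (url, response) result pairs, and the queue.join() handshake via task_done().

    # Hypothetical worker sketch - not the actual RequesterThread implementation.
    import threading
    import urllib.request


    class RequesterThread(threading.Thread):
        def __init__(self, thread_id, work_queue, cache, result_queue):
            super().__init__()
            self.thread_id = thread_id
            self.queue = work_queue
            self.cache = cache              # shared response cache (interface assumed)
            self.result_queue = result_queue  # receives (url, response) pairs

        def run(self):
            while True:
                item = self.queue.get()
                # the None sentinel signals that no work is left
                if item is None:
                    self.queue.task_done()
                    break
                full_url = item["host"].rstrip("/") + "/" + item["url"].lstrip("/")
                try:
                    response = urllib.request.urlopen(full_url, timeout=5)
                except Exception:
                    response = None
                # hand the result back so request_uniq() can pair it with its item
                self.result_queue.put((item["url"], response))
                self.queue.task_done()

Each worker calls task_done() for every item it takes, including its sentinel, so queue.join() in request_uniq() returns only after all queued urls have been requested.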

def run(self):
    for req in self.cache.get_responses():
        # only scrape pages that can contain links/references
        if 'text/html' in req.headers['content-type']:
            self.parser.feed(str(req.content))

            for i in self.parser.get_results():
                # ensure that only resources located on the domain/sub-domain are requested
                if i.startswith('http'):
                    parts = i.split('/')
                    host = parts[2]

                    # if the resource is outside of the domain, skip it
                    if host not in self.host.split('/')[2]:
                        continue
                    # else update the url so that it only contains the relative location
                    else:
                        i = '/'.join(parts[3:])

                self.queue.put({"host": self.host, "url": i})

    # add 'None' to queue - stops threads when no items are left
    for i in range(self.threads):
        self.queue.put(None)

    # start the threads
    for i in range(self.threads):
        w = RequesterThread(i, self.queue, self.cache, self.results)
        w.daemon = True
        self.workers.append(w)
        w.start()

    self.queue.join()
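
The parser used above only needs to expose feed() and get_results(). A minimal sketch of such a link-collecting parser is shown below; it is an assumption built on Python's standard html.parser module, not the parser class actually used by this code, and the tag/attribute filtering is illustrative.

    # Hypothetical link extractor compatible with the feed()/get_results() calls above.
    from html.parser import HTMLParser


    class LinkExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            self.results = set()

        def handle_starttag(self, tag, attrs):
            # keep any attribute value that can point to another resource
            for name, value in attrs:
                if name in ("href", "src") and value:
                    self.results.add(value)

        def get_results(self):
            return self.results

Because the result set accumulates across feed() calls, a single parser instance can be fed every cached text/html response before its links are filtered and queued.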