# Module-level imports these methods rely on:
import os
import re
import logging
import threading
import Queue

import BeautifulSoup

# HostURLParse is this project's URL-resolution helper; import it from
# wherever it is defined in the codebase.


def save_static_by_type(self, host, soup, link_items, attr, is_css=False):
    """Queue the static resources referenced by link_items[attr] for
    download and rewrite each attribute to its future path on disk."""
    q = Queue.Queue()
    for link in link_items:
        logging.debug(host.get_url())
        static_url_obj = HostURLParse(link[attr], host.get_url())
        if not self.static_set.in_set(static_url_obj.get_url_hash()):
            # The hash is added to static_set by static_worker, and only
            # after the file has actually been fetched.
            q.put(static_url_obj)
            link[attr] = os.path.join(self.workspace,
                                      static_url_obj.get_diskrelpath())
        else:
            logging.info('Found static file in cache: ' + static_url_obj.get_url())
    stop_event = threading.Event()
    if not q.empty():
        # Spin up a small pool of daemon workers, wait for the queue to
        # drain, then signal the workers to stop.
        for _ in range(5):
            t = threading.Thread(target=self.static_worker,
                                 args=(q, stop_event, is_css))
            t.setDaemon(True)
            t.start()
        q.join()
        stop_event.set()
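# A minimal standalone sketch (not part of the original code) of the
# Queue/Event pool pattern used above: q.join() returns only once
# task_done() has been called for every put(), after which the event lets
# the daemon workers fall out of their loop instead of blocking forever on
# get(). All names here are illustrative.
def _pool_sketch():
    q = Queue.Queue()
    stop = threading.Event()

    def worker():
        while not stop.is_set():
            try:
                item = q.get(timeout=1)  # timeout so the loop re-checks stop
            except Queue.Empty:
                continue
            # ... process item here ...
            q.task_done()

    for _ in range(5):
        t = threading.Thread(target=worker)
        t.setDaemon(True)
        t.start()
    for item in range(10):
        q.put(item)
    q.join()    # blocks until every queued item is marked done
    stop.set()  # now the idle workers can exit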
def parse_and_enque(self, host, html):
    """Parse a fetched page, enqueue newly discovered links for crawling,
    and rewrite hrefs to point at their on-disk copies."""
    soup = BeautifulSoup.BeautifulSoup(html)
    links = soup.findAll('a', href=True)
    # Honour a <base href=...> tag if present; otherwise resolve relative
    # links against the page's own URL.
    base = soup.find('base', href=True)
    if base:
        base_url = base['href']
        base.extract()
    else:
        base_url = host.get_url()
    for a in links:
        urlhostp = HostURLParse(a['href'], base_url)
        h = urlhostp.get_url_hash()
        if not self.hash_set.in_set(h):
            logging.debug('Enqueuing %s (base: %s)', a['href'], base_url)
            self.queue.put(urlhostp)
            self.hash_set.add(h)
        # Rewrite every link, including already-seen ones, so duplicate
        # links on a page also point at the local copy.
        a['href'] = os.path.join(self.workspace, urlhostp.get_diskrelpath())
    self.save_static_files(host, soup)
    return str(soup)
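# save_static_files is called above but not shown here; a plausible sketch,
# assuming it simply dispatches save_static_by_type over the tag/attribute
# pairs that carry static resources (the exact tag filters are an
# assumption, not taken from the original code):
def save_static_files(self, host, soup):
    # Images and scripts reference their URL via 'src'.
    self.save_static_by_type(host, soup, soup.findAll('img', src=True), 'src')
    self.save_static_by_type(host, soup, soup.findAll('script', src=True), 'src')
    # Stylesheets use 'href' and need their url(...) references rewritten.
    self.save_static_by_type(host, soup,
                             soup.findAll('link', rel='stylesheet', href=True),
                             'href', is_css=True)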
def static_worker(self, static_queue, stop_event, is_css=False):
    """Worker thread: download queued static files, rewrite url(...)
    references inside CSS, and save everything to the workspace."""
    while not stop_event.is_set():
        url_obj = static_queue.get()
        content = self.fetch_file(url_obj.get_url())
        if content:
            # Only mark the file as seen once it has been fetched.
            self.static_set.add(url_obj.get_url_hash())
            if is_css:
                # Note: this pattern only catches bare, unquoted url(...)
                # references.
                prog = re.compile(r'url\(([a-zA-Z0-9_./]+)\)')
                for u in prog.findall(content):
                    logging.debug(url_obj.get_url())
                    u_obj = HostURLParse(u, url_obj.get_url())
                    if not self.static_set.in_set(u_obj.get_url_hash()):
                        static_queue.put(u_obj)
                    abs_path = os.path.join(self.workspace,
                                            u_obj.get_diskrelpath())
                    # Plain string replacement: re.sub() would treat u as a
                    # regex, where '.' in a filename matches any character.
                    content = content.replace(u, abs_path)
            logging.info('Saved static file: ' + url_obj.get_url())
            self.save_file(url_obj, content)
        # Mark the task done even on a failed fetch, or q.join() never returns.
        static_queue.task_done()
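# Hedged refinement (an assumption, not the original behaviour): the pattern
# in static_worker misses quoted url('...') / url("...") references and
# query strings. An illustrative, slightly broader rewrite helper:
CSS_URL_RE = re.compile(r'''url\(\s*['"]?([^'")\s]+)['"]?\s*\)''')

def rewrite_css_urls(content, resolve):
    # resolve() stands in for the HostURLParse/get_diskrelpath logic above,
    # mapping a CSS-relative URL to its path on disk.
    def _sub(match):
        return 'url(%s)' % resolve(match.group(1))
    return CSS_URL_RE.sub(_sub, content)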