def fetch_resource(self, r, key, is_binary, is_attrib_url):
    """Download the resource an element attribute refers to and relink it.

    The reference is turned into an absolute URL, fetched with
    ``get_markup``, stored under ``self.site_path`` and the attribute is
    rewritten to the value returned by ``self.markup_path``.

    Args:
        r: element whose ``attrib`` mapping holds the reference.
        key: name of the attribute to read and rewrite.
        is_binary: forwarded to ``write_file`` as its third argument.
        is_attrib_url: when true the reference is read and written through
            ``get_attribs_key_url`` / ``set_attribs_key_url`` instead of
            plain ``r.attrib[key]`` access.
    """
    if is_attrib_url:
        resource_path = get_attribs_key_url(r.attrib, key)
    else:
        resource_path = r.attrib[key]

    # A data: URI carries its payload inline; there is nothing to download
    # and the attribute is left exactly as it is.
    if resource_path.startswith('data:'):
        return

    if not resource_path.startswith(('http://', 'https://')):
        parsed = urlparse(self.page_url)
        if resource_path.startswith('//'):
            # Protocol-relative reference: it already names its own host,
            # so only the page's scheme is inherited.
            resource_path = parsed.scheme + ':' + resource_path
        else:
            # netloc rather than hostname, so a non-default port of the
            # page is kept in the URL that gets fetched.
            resource_path = parsed.scheme + '://' + adjoin_paths(parsed.netloc, resource_path)

    # Drop any '/<!-- ... -->' fragments that ended up inside the URL.
    for res in re.findall('/<!--.*?-->', resource_path):
        resource_path = resource_path.replace(res, '')
    resource_path = resource_path.replace('http:///', 'http://')
    resource_path = resource_path.replace('https:///', 'https://')

    print('fetching : %s' % resource_path)
    file_content = get_markup(resource_path, source_url=self.page_url)
    print('done : %s' % resource_path)

    # Mirror the resource under <site_path>/<host>/<url path>.
    parsed = urlparse(resource_path)
    internal_path = adjoin_paths(self.site_path, parsed.hostname, parsed.path)
    ensure_dir_exists(os.path.dirname(internal_path))
    if file_content:
        write_file(internal_path, file_content, is_binary)

    markup_path = self.markup_path(internal_path)
    if is_attrib_url:
        set_attribs_key_url(r.attrib, key, markup_path)
    else:
        r.attrib[key] = markup_path

    # Stylesheets get a second pass over the URLs they reference.
    if internal_path.endswith('.css'):
        self.process_css_urls(internal_path, resource_path)
def _cl(r):
    """Fetch one relative reference found in a stylesheet and store it locally.

    The reference is resolved against the directory of
    ``online_css_path_file`` for downloading and against the directory of
    ``local_css_path_file`` for saving; both names, and ``self``, come from
    the enclosing scope.

    Args:
        r: reference string taken from the stylesheet (presumably the
            argument of a ``url(...)`` -- confirm against the caller).

    Raises:
        Exception: if ``r`` is an absolute ``http://`` / ``https://`` URL,
            which this helper does not handle.
    """
    if r.startswith(('http://', 'https://')):
        raise Exception('Unhandled css parse case')

    # Inline data: payloads have nothing to download.
    if r.startswith('data:'):
        return

    # The on-disk name must not carry the query string or fragment of the
    # reference (e.g. 'font.woff?v=4' or 'font.svg#name').
    local_ref = r.split('?')[0].split('#')[0]
    if not local_ref:
        # Fragment-only / query-only reference: no file to fetch.
        return

    online_path_file = adjoin_paths(os.path.dirname(online_css_path_file.split('?')[0]), r)
    online_file = os.path.basename(online_path_file).split('?')[0]
    local_path_file = adjoin_paths(os.path.dirname(local_css_path_file), local_ref)
    ensure_dir_exists(os.path.dirname(local_path_file))

    print('fetching : %s' % online_file)
    # Only the name shown in the 'done' message is cleaned here; the URL
    # that is fetched is used as built above.
    for res in re.findall('/<!--.*?-->', online_file):
        online_file = online_file.replace(res, '')
    file_content = get_markup(online_path_file, source_url=self.page_url)
    print('done : %s' % online_file)

    if file_content:
        write_file(local_path_file, file_content, True)
def crawl_page(self, page_url, depth):
    """Crawl one page, rewrite its anchors and schedule its processing.

    Anchors whose host equals the page's host are scheduled for crawling
    with ``depth - 1`` and re-pointed at their local ``index.html``; every
    other anchor with an ``href`` is replaced by ``'#'``.

    Args:
        page_url: URL of the page; a trailing ``/`` is appended if missing.
        depth: remaining crawl depth; ``0`` stops immediately.
    """
    if depth == 0:
        return

    # Compared before the trailing slash is added below.
    is_home_page = page_url == self.home_page_url
    if not page_url.endswith('/'):
        page_url = page_url + '/'

    if page_url in self.being_crawled:
        return
    print('\ncrawling page : %s' % page_url)
    self.being_crawled.add(page_url)

    page = SitePage(
        page_url,
        self.site_path,
        process_inline_js=self.process_inline_js,
        process_embedded_css=self.process_embedded_css,
        fetch_resources=self.fetch_resources,
        remove_comments=self.remove_comments,
        remove_ns_tags=self.remove_ns_tags,
        randomize_text=self.randomize_text,
    )
    tree = page.parse_markup()

    page_host = urlparse(page_url).hostname
    for anchor in tree.xpath('//a'):
        if 'href' not in anchor.attrib:
            continue
        link = anchor.attrib['href'].strip()
        link_parts = urlparse(link)

        if link_parts.hostname != page_host:
            # Different host (or no host at all): neutralise the link.
            anchor.attrib['href'] = '#'
            continue

        self.pool.spawn(self.crawl_page, link, depth - 1)
        local_index = adjoin_paths(self.site_path, link_parts.hostname, link_parts.path, 'index.html')
        anchor.attrib['href'] = page.markup_path(local_index)

    should_open = self.open_home_page_in_browser and is_home_page
    self.pool.spawn(page.main_logic, open_when_done=should_open)
def main_logic(self, open_when_done=False):
    """Run the enabled processing steps for this page and write its index file.

    Each step is controlled by a flag on the instance; the rendered markup
    is written to ``self.index_path``.

    Args:
        open_when_done: when true, ``open_in_browser`` is called on the
            written index file at the end.
    """
    print('getting resources for page : %s' % self.page_url)

    # Links, script sources, images, flash and other media.
    if self.fetch_resouces:
        self.fetch_external_resources()

    # Wait for all blocking IO threads to finish fetching external resources.
    self.pool.waitall()

    # Lorem-ipsify the text.
    if self.randomize_text:
        self.process_text()

    # Embedded css.
    if self.process_embedded_css:
        css_dir = adjoin_paths(os.path.dirname(self.index_path), 'imbedded_css')
        self.process_internal_asset('//style', css_dir, 'imbedded.css', 'head', create_css_link)

    # Inline js.
    if self.process_inline_js:
        js_dir = adjoin_paths(os.path.dirname(self.index_path), 'inline_js')
        self.process_internal_asset('//body//script', js_dir, 'inline.js', 'body', create_js_tag)

    # Comments.
    if self.remove_comments:
        self.filter_comments(True)

    # Noscript tags.
    if self.remove_ns_tags:
        self.process_noscript_tags()

    # Render the tree and store it as the page's index file.
    html_output = render_html_element(self.tree_root, format_html5=self.use_html5)
    ensure_dir_exists(os.path.dirname(self.index_path))
    write_file(self.index_path, html_output)
    print('done getting resources : %s' % self.page_url)

    if open_when_done:
        open_in_browser(self.index_path)
def __init__(self, home_page_url, site_name, www_path, depth=1,
             process_inline_js=True, process_embedded_css=True,
             fetch_resources=True, remove_comments=True,
             remove_ns_tags=True, open_home_page_in_browser=False,
             randomize_text=False):
    """Store the crawl configuration and create the shared crawl state.

    Args:
        home_page_url: URL stored as ``self.home_page_url``.
        site_name: name stored on the instance and used as the last
            component of ``self.site_path``.
        www_path: directory whose real path is the parent of
            ``self.site_path``.
        depth: stored as ``self.depth``.
        process_inline_js, process_embedded_css, fetch_resources,
        remove_comments, remove_ns_tags, randomize_text: per-page
            processing flags, stored unchanged.
        open_home_page_in_browser: stored unchanged.
    """
    # What to crawl and where the copy lives on disk.
    self.home_page_url = home_page_url
    self.depth = depth
    self.site_name = site_name
    self.site_path = adjoin_paths(os.path.realpath(www_path), site_name)

    # Shared crawl state: green thread pool and the set of visited URLs.
    self.pool = eventlet.GreenPool(10000)
    self.being_crawled = set()

    # Processing options.
    self.process_inline_js = process_inline_js
    self.process_embedded_css = process_embedded_css
    self.fetch_resources = fetch_resources
    self.remove_comments = remove_comments
    self.remove_ns_tags = remove_ns_tags
    self.randomize_text = randomize_text
    self.open_home_page_in_browser = open_home_page_in_browser
def __init__(self, page_url, site_path, use_html5=False,
             process_inline_js=True, process_embedded_css=True,
             fetch_resources=True, remove_comments=True,
             remove_ns_tags=True, randomize_text=False):
    """Store the page settings and derive the local index file path.

    Args:
        page_url: URL of the page; its host and path determine
            ``self.index_path``.
        site_path: root directory under which the index path is built.
        use_html5: stored as ``self.use_html5``.
        process_inline_js, process_embedded_css, fetch_resources,
        remove_comments, remove_ns_tags, randomize_text: processing
            flags, stored on the instance.
    """
    self.pool = eventlet.GreenPool(10000)

    # Page identity and parse state.
    self.page_url = page_url
    self.site_path = site_path
    self.tree_root = None
    self.use_html5 = use_html5

    # <site_path>/<host>/<url path>/index.html
    url_parts = urlparse(page_url)
    self.index_path = adjoin_paths(site_path, url_parts.hostname, url_parts.path, 'index.html')

    # Processing options.
    self.process_inline_js = process_inline_js
    self.process_embedded_css = process_embedded_css
    # NOTE: the attribute name is misspelled on purpose here; main_logic
    # reads exactly ``self.fetch_resouces``.
    self.fetch_resouces = fetch_resources
    self.remove_comments = remove_comments
    self.remove_ns_tags = remove_ns_tags
    self.randomize_text = randomize_text
def process_internal_asset(self, selector, asset_path, asset_file, t_tag, new_cl):
    """Move the text of the selected elements into one external asset file.

    The stripped text of every element matched by ``selector`` is joined
    with blank lines and written to a uniquely named file under
    ``asset_path``. Matched elements are detached from the tree; those
    with a ``src`` attribute are re-attached under ``head`` with their
    text cleared. A new element produced by ``new_cl`` is appended to
    ``t_tag``.

    Args:
        selector: XPath expression evaluated on ``self.tree_root``.
        asset_path: directory the asset file is written to.
        asset_file: base file name handed to ``uniquify_file_name``.
        t_tag: tag name handed to ``append_to_tag`` for the new element.
        new_cl: callable that receives the markup path of the written
            file; its result is appended to ``t_tag``.
    """
    res = self.tree_root.xpath(selector)
    file_content = u'\n\n'.join((r.text or u'').strip() for r in res)

    for r in res:
        remove_from_parent(r)
        r.text = None
        # Explicit membership test instead of assert/except control flow:
        # an assert disappears under ``python -O`` and a bare except also
        # swallows KeyboardInterrupt and greenlet exits.
        if 'src' in r.attrib:
            try:
                append_to_tag(self.tree_root, 'head', r)
            except Exception:
                # Best effort, as before: if re-attaching fails the
                # element simply stays out of the tree.
                remove_from_parent(r)

    unique_file_name = uniquify_file_name(asset_path, asset_file)
    internal_path = adjoin_paths(asset_path, unique_file_name)
    ensure_dir_exists(asset_path)
    write_file(internal_path, file_content)
    append_to_tag(self.tree_root, t_tag, new_cl(self.markup_path(internal_path)))

    # Extracted stylesheets get their referenced URLs processed as well.
    if asset_file.endswith('.css'):
        self.process_css_urls(internal_path, self.page_url)
def open_in_browser(path):
    """Open ``path`` in the web browser through a ``file://`` URL.

    The URL is built by joining ``'file://'`` and ``path`` with
    ``adjoin_paths`` and handed to ``webbrowser.open``.
    """
    webbrowser.open(adjoin_paths('file://', path))