def fetch_resource(self, r, key, is_binary, is_attrib_url):
    """Download the resource referenced by ``r.attrib[key]`` and relink it.

    The URL is made absolute against ``self.page_url``, fetched with
    ``get_markup``, written below ``self.site_path`` (in a folder named after
    the resource's host) and the attribute is then replaced by
    ``self.markup_path(internal_path)``. Downloaded ``.css`` files are handed
    to ``self.process_css_urls`` afterwards.

    Args:
        r: Element-like object exposing an ``attrib`` mapping.
        key: Name of the attribute that holds the resource URL.
        is_binary: Forwarded unchanged to ``write_file``.
        is_attrib_url: When true the URL is read and written through
            ``get_attribs_key_url`` / ``set_attribs_key_url`` instead of
            plain ``r.attrib[key]`` access.

    Returns:
        None. The element is modified in place.
    """
    if is_attrib_url:
        resource_path = get_attribs_key_url(r.attrib, key)
    else:
        resource_path = r.attrib[key]

    # A data: URI carries its payload inline; there is nothing to download
    # and treating it as a relative path would request a bogus URL and then
    # overwrite the attribute. Leave the element untouched.
    if resource_path.lower().startswith('data:'):
        return

    if resource_path.startswith('//'):
        # Protocol-relative reference: it already names its own host and
        # only inherits the scheme of the page.
        resource_path = urlparse(self.page_url).scheme + ':' + resource_path
    elif not resource_path.startswith(('http://', 'https://')):
        parsed = urlparse(self.page_url)
        resource_path = (parsed.scheme + '://'
                         + adjoin_paths(parsed.hostname, resource_path))

    # Drop HTML comments that ended up inside a path segment.
    resource_path = re.sub(r'/<!--.*?-->', '', resource_path)

    # Joining may leave an empty authority ("http:///..."); collapse it.
    resource_path = resource_path.replace('http:///', 'http://')
    resource_path = resource_path.replace('https:///', 'https://')

    # Single-argument parenthesised print behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print('fetching : %s' % resource_path)
    file_content = get_markup(resource_path, source_url=self.page_url)
    print('done : %s' % resource_path)

    parsed = urlparse(resource_path)
    internal_path = adjoin_paths(self.site_path, parsed.hostname, parsed.path)
    ensure_dir_exists(os.path.dirname(internal_path))
    if file_content:
        write_file(internal_path, file_content, is_binary)

    # The attribute is relinked even when nothing was written, exactly as
    # before; only the file write depends on the download result.
    markup_path = self.markup_path(internal_path)
    if is_attrib_url:
        set_attribs_key_url(r.attrib, key, markup_path)
    else:
        r.attrib[key] = markup_path

    if internal_path.endswith('.css'):
        self.process_css_urls(internal_path, resource_path)
def _cl(r):
    """Fetch one asset referenced by a stylesheet and save it locally.

    ``r`` is the reference as written in the stylesheet. It is joined onto
    the directory of ``online_css_path_file`` (query string removed) to get
    the address to download, and onto the directory of
    ``local_css_path_file`` to get the file to write. Both of those names,
    and ``self``, are free variables that must come from the enclosing
    scope.

    Raises:
        Exception: if ``r`` starts with ``http://`` or ``https://``; such
            references are not handled here.
    """
    if r.startswith(('http://', 'https://')):
        raise Exception('Unhandled css parse case')

    remote_dir = os.path.dirname(online_css_path_file.split('?')[0])
    remote_location = adjoin_paths(remote_dir, r)
    # Short name used only for the progress messages below.
    label = os.path.basename(remote_location).split('?')[0]

    destination = adjoin_paths(os.path.dirname(local_css_path_file), r)
    ensure_dir_exists(os.path.dirname(destination))

    print('fetching : %s' % label)
    # Comment fragments are removed from the label only; the download
    # address itself is used as computed above.
    for fragment in re.findall('/<!--.*?-->', label):
        label = label.replace(fragment, '')

    payload = get_markup(remote_location, source_url=self.page_url)
    print('done : %s' % label)

    if payload:
        write_file(destination, payload, True)
def process_internal_asset(self, selector, asset_path, asset_file, t_tag, new_cl):
    """Move the text of inline elements into one external asset file.

    Every element matched by ``selector`` is detached from the tree and its
    text is cleared. Elements that carry a ``src`` attribute are re-attached
    under ``head``; the others stay removed. The collected text is written
    to a uniquely named file inside ``asset_path`` and a node built by
    ``new_cl`` is appended to ``t_tag`` so the page references that file.

    Args:
        selector: XPath expression evaluated on ``self.tree_root``.
        asset_path: Directory that receives the generated file.
        asset_file: Preferred file name; passed through
            ``uniquify_file_name`` before use.
        t_tag: Tag name handed to ``append_to_tag`` for the new reference.
        new_cl: Callable that receives ``self.markup_path(internal_path)``
            and returns the node to append.

    Returns:
        None.
    """
    matched = self.tree_root.xpath(selector)

    # Missing text counts as empty. Each element's text is stripped and the
    # pieces are separated by one blank line.
    file_content = u'\n\n'.join((el.text or u'').strip() for el in matched)

    for el in matched:
        remove_from_parent(el)
        el.text = None
        # Explicit membership test instead of assert + bare except: an
        # assert disappears under ``python -O`` and a bare except would
        # also swallow KeyboardInterrupt / SystemExit.
        if 'src' not in el.attrib:
            # Purely inline element: its content now lives in the asset
            # file, so it simply stays detached.
            continue
        try:
            append_to_tag(self.tree_root, 'head', el)
        except Exception:
            # Best effort, as before: if re-attaching fails, make sure the
            # element does not linger in the tree.
            remove_from_parent(el)

    unique_file_name = uniquify_file_name(asset_path, asset_file)
    internal_path = adjoin_paths(asset_path, unique_file_name)
    ensure_dir_exists(asset_path)
    write_file(internal_path, file_content)

    append_to_tag(self.tree_root, t_tag, new_cl(self.markup_path(internal_path)))

    if asset_file.endswith('.css'):
        self.process_css_urls(internal_path, self.page_url)
def main_logic(self, open_when_done=False):
    """Run the whole page-processing pipeline and write the index file.

    Each optional stage is switched by an attribute on ``self``. After the
    stages have run, the tree is rendered and written to
    ``self.index_path``.

    Args:
        open_when_done: When true, ``open_in_browser`` is called on the
            written index file at the end.
    """
    def _next_to_index(folder_name):
        # Folder located in the same directory as the index file.
        return adjoin_paths(os.path.dirname(self.index_path), folder_name)

    print('getting resources for page : %s' % self.page_url)

    # Links, script sources, images, flash and other media.
    if self.fetch_resouces:
        self.fetch_external_resources()

    # Block until all pending external-resource fetches have finished.
    self.pool.waitall()

    # Replace page text with filler text.
    if self.randomize_text:
        self.process_text()

    # Embedded <style> blocks go to an external stylesheet.
    if self.process_embedded_css:
        self.process_internal_asset(
            '//style',
            _next_to_index('imbedded_css'),
            'imbedded.css',
            'head',
            create_css_link,
        )

    # Inline <script> blocks in the body go to an external script.
    if self.process_inline_js:
        self.process_internal_asset(
            '//body//script',
            _next_to_index('inline_js'),
            'inline.js',
            'body',
            create_js_tag,
        )

    # Strip comments.
    if self.remove_comments:
        self.filter_comments(True)

    # Handle <noscript> tags.
    if self.remove_ns_tags:
        self.process_noscript_tags()

    # Serialise the tree and store it as the index file.
    rendered = render_html_element(self.tree_root, format_html5=self.use_html5)
    ensure_dir_exists(os.path.dirname(self.index_path))
    write_file(self.index_path, rendered)

    print('done getting resources : %s' % self.page_url)

    if open_when_done:
        open_in_browser(self.index_path)