Python ensure_dir_exists Exemples, utils.file_system.ensure_dir_exists Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : hc.py Projet : rabihkodeih/htmlcrawler

 def fetch_resource(self, r, key, is_binary, is_attrib_url):
     if is_attrib_url:
         resource_path = get_attribs_key_url(r.attrib, key)
     else:
         resource_path = r.attrib[key]
     if not (resource_path.startswith('http://') or resource_path.startswith('https://')):
         parsed = urlparse(self.page_url)
         resource_path = parsed.scheme + '://' + adjoin_paths(parsed.hostname, resource_path)
     for res in re.findall('/<!--.*?-->', resource_path): resource_path = resource_path.replace(res, '')
     resource_path = resource_path.replace('http:///', 'http://')
     resource_path = resource_path.replace('https:///', 'https://')
     print 'fetching :', resource_path
     file_content = get_markup(resource_path, source_url=self.page_url)
     print 'done     : %s' % resource_path
     parsed = urlparse(resource_path)
     internal_path = adjoin_paths(self.site_path, parsed.hostname, parsed.path)
     ensure_dir_exists(os.path.dirname(internal_path))
     if file_content: write_file(internal_path, file_content, is_binary)
     markup_path = self.markup_path(internal_path)
     if is_attrib_url:
         set_attribs_key_url(r.attrib, key, markup_path)
     else:
         r.attrib[key] = markup_path
     if internal_path.endswith('.css'):
         self.process_css_urls(internal_path, resource_path)

Exemple #2

0

Afficher le fichier

Fichier : hc.py Projet : rabihkodeih/htmlcrawler

 def _cl(r):
     if r.startswith('http://'): raise Exception('Unhandled css parse case')
     if r.startswith('https://'): raise Exception('Unhandled css parse case')
     online_path_file = adjoin_paths(os.path.dirname(online_css_path_file.split('?')[0]), r)
     online_file = os.path.basename(online_path_file).split('?')[0]
     local_path_file = adjoin_paths(os.path.dirname(local_css_path_file), r)
     ensure_dir_exists(os.path.dirname(local_path_file))
     print 'fetching :', online_file
     for res in re.findall('/<!--.*?-->', online_file): online_file = online_file.replace(res, '')
     file_content = get_markup(online_path_file, source_url=self.page_url)
     print 'done     : %s' % online_file
     if file_content: write_file(local_path_file, file_content, True)

Exemple #3

0

Afficher le fichier

Fichier : hc.py Projet : rabihkodeih/htmlcrawler

 def process_internal_asset(self, selector, asset_path, asset_file, t_tag, new_cl):
     """Collect the text of all nodes matching ``selector`` into one asset file.

     The text of every matched node is stripped and joined with blank lines,
     the nodes are detached from the tree, and the combined text is written
     to a uniquely named file inside ``asset_path``. A new node created by
     ``new_cl`` for that file is appended to the ``t_tag`` element. Matched
     nodes that carry a ``src`` attribute are kept, emptied of text, and
     re-attached under ``head``.

     selector   -- XPath expression evaluated on ``self.tree_root``
     asset_path -- directory receiving the generated file
     asset_file -- base file name handed to ``uniquify_file_name``
     t_tag      -- tag name passed to ``append_to_tag`` for the new node
     new_cl     -- callable building the new node from the value returned
                   by ``self.markup_path``
     """
     res = self.tree_root.xpath(selector)
     # Text is gathered before the nodes are emptied below.
     file_content = u'\n\n'.join(
         u'\n'.join((r.text if r.text else u'').split('\n')).strip() for r in res)
     for r in res:
         remove_from_parent(r)
         r.text = None
         # Explicit membership test instead of assert-in-try: asserts are
         # stripped under "python -O", which would re-attach every node.
         if 'src' in r.attrib:
             try:
                 append_to_tag(self.tree_root, 'head', r)
             except Exception:
                 # Best effort, as before: a node that cannot be re-attached
                 # is simply left out of the tree.
                 remove_from_parent(r)
     unique_file_name = uniquify_file_name(asset_path, asset_file)
     internal_path = adjoin_paths(asset_path, unique_file_name)
     ensure_dir_exists(asset_path)
     write_file(internal_path, file_content)
     append_to_tag(self.tree_root, t_tag, new_cl(self.markup_path(internal_path)))
     # Generated stylesheets still need their url references processed.
     if asset_file.endswith('.css'):
         self.process_css_urls(internal_path, self.page_url)

Exemple #4

0

Afficher le fichier

Fichier : hc.py Projet : rabihkodeih/htmlcrawler

    def main_logic(self, open_when_done=False):
        print 'getting resources for page  : %s' % self.page_url
        # process links, script sources and images, flash and other media
        if self.fetch_resouces:
            self.fetch_external_resources()

        # wait for all blocking IO threads to finish fetching external resources
        self.pool.waitall()

        # lorem ipsify text
        if self.randomize_text:
            self.process_text()

        # process imbedded css
        if self.process_embedded_css:
            self.process_internal_asset('//style', adjoin_paths(os.path.dirname(self.index_path), 
                                                            'imbedded_css'), 'imbedded.css', 'head', create_css_link)     
        
        # process inline js
        if self.process_inline_js:
            self.process_internal_asset('//body//script', adjoin_paths(os.path.dirname(self.index_path), 
                                                            'inline_js'), 'inline.js', 'body', create_js_tag)
        
        # remove comments
        if self.remove_comments:
            self.filter_comments(True)
        
        # process noscript tags
        if self.remove_ns_tags:
            self.process_noscript_tags()
        
        # render html
        html_output = render_html_element(self.tree_root, format_html5=self.use_html5)
        #print html_output
        
        ensure_dir_exists(os.path.dirname(self.index_path))
        write_file(self.index_path, html_output)

        print 'done getting resources      : %s' % self.page_url
        if open_when_done: open_in_browser(self.index_path)