def parse_css(doc, url): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links found in the CSS. """ global config new_urls = [] L = htmldata.urlextract(doc, url, "text/css") for item in L: # Store url locally. u = item.url if config.no_images and any(u.strip().lower().endswith(suffix) for suffix in (".jpg", ".gif", ".png", ".ico")): item.url = "" continue new_urls += [u] item.url = url_to_relative(u, url) newdoc = htmldata.urljoin(doc, L) newdoc = post_css_transform(newdoc, url) return (newdoc, new_urls)
def parse_html(doc, url, config): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') for item in L: u = item.url if should_follow(url, u): # Store url locally. new_urls += [u] item.url = url_to_relative(u, url, config) else: item.url = rewrite_external_url(item.url, config) newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = post_html_transform(newdoc, url, config) return (newdoc, new_urls)
def parse_html(doc, url, config): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] if TIDY: options = dict(output_xhtml=1, wrap=0) doc = str(tidy.parseString(doc, **options)) # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') for item in L: u = item.url if should_follow(url, u, config): # Store url locally. new_urls += [u] item.url = url_to_relative(u, url, config) else: item.url = rewrite_external_url(item.url, config) newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = newdoc.replace('<br>', '<br/>') newdoc = post_html_transform(newdoc, url, config) return (newdoc, new_urls)
def parse_css(doc, url): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links found in the CSS. """ global config new_urls = [] L = htmldata.urlextract(doc, url, 'text/css') for item in L: # Store url locally. u = item.url if config.no_images and any(u.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')): item.url = '' continue new_urls += [u] item.url = url_to_relative(u, url) newdoc = htmldata.urljoin(doc, L) newdoc = post_css_transform(newdoc, url) return (newdoc, new_urls)
def crawl(self):
    # Are there more pages to crawl?
    if len(self.queue) == 0:
        return  # task done

    # Get a URL.
    url = self.queue.pop()

    # Process it only if it is a new one.
    if url not in self.seen:
        self.seen.append(url)

        splitted_url = urlparse.urlsplit(url)
        local_file = self.root + splitted_url.path
        (head, tail) = os.path.split(local_file)
        if tail == '':
            tail = 'index.html'
            local_file = local_file + '/' + tail
        if os.path.isdir(local_file):
            local_file += '/index.html'
        if splitted_url.query:      # query strings could contain anything, let's
            md5 = hashlib.md5()     # not put them raw in filenames
            md5.update(splitted_url.query)
            local_file += '.' + md5.hexdigest()

        if os.path.exists(head):
            # Main logic for "unnamed pages".
            if os.path.isfile(head):
                os.rename(head, head + '.tmp')
                os.makedirs(head)
                os.rename(head + '.tmp', head + '/index.html')
        else:
            os.makedirs(head)

        if self.verbose:
            print 'saving', url, 'to', local_file
        urllib.urlretrieve(url, local_file)

        # Is it HTML?
        f = open(local_file, 'rb')
        try:
            # brutal, I know...
            for u in htmldata.urlextract(f.read(), url):
                # Get rid of url "fragments".
                new_split = urlparse.urlparse(u.url)
                new_url = 'http://' + new_split.netloc + new_split.path
                if new_split.query:
                    new_url += '?' + new_split.query
                if self.valid(new_url) and new_url not in self.seen and new_url not in self.queue:
                    self.queue.append(new_url)
        except:
            if self.verbose:
                print url, 'is not HTML'

    # Recurse.
    if self.verbose:
        print "QUEUE LENGTH:", len(self.queue)
    self.crawl()

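# The filename logic inside crawl() can be summarised as a small standalone
# helper. The name url_to_local_file() is hypothetical; the behaviour mirrors
# the snippet above: append index.html for directory-style URLs and hash any
# query string so it never appears raw in a filename. (Written for Python 2,
# like the crawler above; it omits the on-disk isdir/exists checks.)
import hashlib
import os
import urlparse

def url_to_local_file(root, url):
    parts = urlparse.urlsplit(url)
    local_file = root + parts.path
    head, tail = os.path.split(local_file)
    if tail == '':
        # Directory-style URL: store it as .../index.html.
        local_file = local_file + '/index.html'
    if parts.query:
        # Query strings could contain anything, so store a digest instead.
        md5 = hashlib.md5()
        md5.update(parts.query)
        local_file += '.' + md5.hexdigest()
    return local_file

# url_to_local_file('/tmp/mirror', 'http://example.com/search?q=htmldata')
# -> '/tmp/mirror/search.<md5 of "q=htmldata">'
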
def parse_html(doc, url, filename): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ global config global counter BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] doc = pre_html_transform(doc, url) # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') # in this code we change each absolute url in L # into a relative one. # we also kick-off zillions of subthreads to collect # more pages. for item in L: u = item.url follow = should_follow(u) #and (counter < 10) if follow: if config.debug: print('ACCEPTED - ', u) # Store url locally. new_urls += [u] item.url = url_to_relative(u, url) else: # James, let's keep everything by default (but not follow it). # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ): # item.url = '' if config.debug: print('NOT INCLUDED - ', u) newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = pos_html_transform(newdoc, url,filename) # Remove byte artifacts in string newdoc = newdoc.replace('\\n','\n') newdoc = newdoc.replace('\\t', '\t') newdoc = newdoc.strip('b') newdoc = newdoc.strip('') return (newdoc, new_urls)
def parse_html(doc, url, filename): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ global config global counter BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] doc = pre_html_transform(doc, url) # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') # in this code we change each absolute url in L # into a relative one. # we also kick-off zillions of subthreads to collect # more pages. for item in L: u = item.url follow = should_follow(u) #and (counter < 10) if follow: if config.debug: print 'ACCEPTED - ', u # Store url locally. new_urls += [u] item.url = url_to_relative(u, url) else: # James, let's keep everything by default (but not follow it). # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ): # item.url = '' if config.debug: print 'NOT INCLUDED - ', u newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = pos_html_transform(newdoc, url,filename) return (newdoc, new_urls)
def parse_css(doc, url, config): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links found in the CSS. """ new_urls = [] L = htmldata.urlextract(doc, url, 'text/css') for item in L: # Store url locally. u = item.url new_urls += [u] item.url = url_to_relative(u, url, config) newdoc = htmldata.urljoin(doc, L) newdoc = post_css_transform(newdoc, url, config) return (newdoc, new_urls)
def parse_html(doc, url): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ global config BEGIN_COMMENT_REPLACE = "<BEGINCOMMENT-" + str(random.random()) + ">" END_COMMENT_REPLACE = "<ENDCOMMENT-" + str(random.random()) + ">" new_urls = [] doc = pre_html_transform(doc, url) # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace("<!--", BEGIN_COMMENT_REPLACE) doc = doc.replace("-->", END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, "text/html") for item in L: u = item.url follow = should_follow(u) if follow: if config.debug: print "ACCEPTED - ", u # Store url locally. new_urls += [u] item.url = url_to_relative(u, url) else: if not any( license in u for license in ("creativecommons.org", "wxwidgets.org", "gnu.org", "mediawiki.org") ): item.url = "" if config.debug: print "DENIED - ", u newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, "<!--") newdoc = newdoc.replace(END_COMMENT_REPLACE, "-->") newdoc = pos_html_transform(newdoc, url) return (newdoc, new_urls)
def parse_html(doc, url): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ global config BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] doc = pre_html_transform(doc, url) # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') for item in L: u = item.url follow = should_follow(u) if follow: if config.debug: print 'ACCEPTED - ', u # Store url locally. new_urls += [u] item.url = url_to_relative(u, url) else: if not any(license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org')): item.url = '' if config.debug: print 'DENIED - ', u newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = pos_html_transform(newdoc, url) return (newdoc, new_urls)