Example #1
def parse_css(doc, url):
    """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links found in the CSS.
  """
    global config

    new_urls = []

    L = htmldata.urlextract(doc, url, "text/css")
    for item in L:
        # Store url locally.
        u = item.url

        if config.no_images and any(u.strip().lower().endswith(suffix) for suffix in (".jpg", ".gif", ".png", ".ico")):
            item.url = ""
            continue

        new_urls += [u]
        item.url = url_to_relative(u, url)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = post_css_transform(newdoc, url)

    return (newdoc, new_urls)
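These examples all lean on a url_to_relative helper that is never defined on this page. A minimal stdlib-only sketch of what such a helper might look like (hypothetical; some variants below also pass a config argument, which this sketch omits):

import posixpath
from urllib.parse import urlsplit

def url_to_relative(target, base):
    """Hypothetical sketch: rewrite absolute URL `target` as a path
    relative to the page at `base`, assuming both share one host."""
    t, b = urlsplit(target), urlsplit(base)
    if t.netloc != b.netloc:
        return target  # different host: leave the URL absolute
    # Relative path from the base page's directory to the target.
    rel = posixpath.relpath(t.path or '/', posixpath.dirname(b.path) or '/')
    return rel + ('?' + t.query if t.query else '')

Under this sketch, url_to_relative('http://example.com/css/site.css', 'http://example.com/index.html') returns 'css/site.css'.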
Example #2
def parse_html(doc, url, config):
  """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links we want to spider in the HTML.
  """
  BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
  END_COMMENT_REPLACE   = '<ENDCOMMENT-' + str(random.random()) + '>'

  new_urls = []  

  # Temporarily "get rid" of comments so htmldata will find the URLs
  # in the funky "<!--[if" HTML hackery for IE.
  doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
  doc = doc.replace('-->', END_COMMENT_REPLACE)

  L = htmldata.urlextract(doc, url, 'text/html')
  for item in L:
    u = item.url
    if should_follow(url, u):
      # Store url locally.
      new_urls += [u]
      item.url = url_to_relative(u, url, config)
    else:
      item.url = rewrite_external_url(item.url, config)

  newdoc = htmldata.urljoin(doc, L)
  newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
  newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
  newdoc = post_html_transform(newdoc, url, config)

  return (newdoc, new_urls)
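The comment-masking trick above is easy to demonstrate in isolation: IE conditional comments like <!--[if IE]> hide their URLs inside HTML comments, so the delimiters are swapped for unique sentinels before extraction and restored afterwards. A self-contained illustration (the extraction step is elided):

import random

doc = '<!--[if IE]><link rel="stylesheet" href="ie.css"><![endif]-->'

# Random sentinels make a collision with literal page text vanishingly unlikely.
begin = '<BEGINCOMMENT-' + str(random.random()) + '>'
end = '<ENDCOMMENT-' + str(random.random()) + '>'

masked = doc.replace('<!--', begin).replace('-->', end)
# ... run htmldata.urlextract / htmldata.urljoin over `masked` here ...
restored = masked.replace(begin, '<!--').replace(end, '-->')
assert restored == doc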
Example #3
def parse_html(doc, url, config):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'

    new_urls = []
    if TIDY:
        options = dict(output_xhtml=1, wrap=0)
        doc = str(tidy.parseString(doc, **options))

    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
    doc = doc.replace('-->', END_COMMENT_REPLACE)

    L = htmldata.urlextract(doc, url, 'text/html')
    for item in L:
        u = item.url
        if should_follow(url, u, config):
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url, config)
        else:
            item.url = rewrite_external_url(item.url, config)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
    newdoc = newdoc.replace('<br>', '<br/>')
    newdoc = post_html_transform(newdoc, url, config)
    return (newdoc, new_urls)
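The TIDY flag and tidy module in this variant come from uTidylib, which normalizes messy markup to well-formed XHTML before URL extraction. How the flag gets set is not shown; a plausible guard (an assumption, not the original code) is an import-time fallback:

try:
    import tidy  # uTidylib
    TIDY = True
except ImportError:
    TIDY = False  # tidy unavailable: parse the document as-is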
Example #4
def parse_css(doc, url):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links found in the CSS.
    """
    global config

    new_urls = []

    L = htmldata.urlextract(doc, url, 'text/css')
    for item in L:
        # Store url locally.
        u = item.url

        if config.no_images and any(u.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
            item.url = ''
            continue

        new_urls += [u]
        item.url = url_to_relative(u, url)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = post_css_transform(newdoc, url)

    return (newdoc, new_urls)
Example #5
    def crawl(self):

        # are there more pages to crawl ?
        if len(self.queue) == 0:
            return # task done

        # get a URL
        url = self.queue.pop()

        # process it only if it is a new one
        if url not in self.seen:
            self.seen.append(url)
            splitted_url = urlparse.urlsplit(url)
            local_file = self.root + splitted_url.path
            (head, tail) = os.path.split(local_file)
            if tail == '':
                tail = 'index.html'
                local_file = local_file + '/' + tail
            if os.path.isdir(local_file):
                local_file += '/index.html'

            if splitted_url.query:
                # Query strings could contain anything; don't put them
                # raw in filenames.
                md5 = hashlib.md5()
                md5.update(splitted_url.query)
                local_file += '.' + md5.hexdigest()

            if os.path.exists(head):  # main logic for "unnamed pages"
                if os.path.isfile(head):
                    os.rename(head, head + '.tmp')
                    os.makedirs(head)
                    os.rename(head + '.tmp', head + '/index.html')
            else:
                os.makedirs(head)

            if self.verbose: print 'saving', url, 'to', local_file
            urllib.urlretrieve(url, local_file)

            # is it HTML ?
            # is it HTML ?
            with open(local_file, 'rb') as f:
                data = f.read()
            try: # brutal, I know...
                for u in htmldata.urlextract(data, url):
                    # get rid of url "fragments"
                    new_split = urlparse.urlparse(u.url)
                    new_url = new_split.scheme + '://' + new_split.netloc + new_split.path
                    if new_split.query:
                        new_url += '?' + new_split.query
                    if self.valid(new_url) and new_url not in self.seen and new_url not in self.queue:
                        self.queue.append(new_url)
            except:
                if self.verbose:
                    print url, 'is not HTML'

        # recurse
        if self.verbose: print "QUEUE LENGTH:", len(self.queue)
        self.crawl()
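The query-string hashing in crawl keeps arbitrary query text out of filenames. The example is Python 2; in Python 3 the same idea needs an explicit encode, since hashlib only accepts bytes. A small standalone sketch:

import hashlib
from urllib.parse import urlsplit

url = 'http://example.com/search?q=a b&page=2'
parts = urlsplit(url)
local_file = 'root' + parts.path
if parts.query:
    # Hash the query so characters like '?', '&' and spaces never
    # reach the filesystem.
    local_file += '.' + hashlib.md5(parts.query.encode('utf-8')).hexdigest()
print(local_file)  # root/search.<32 hex digits>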
Example #6
def parse_html(doc, url, filename):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    global config
    global counter

    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'

    new_urls = []

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
    doc = doc.replace('-->', END_COMMENT_REPLACE)


    L = htmldata.urlextract(doc, url, 'text/html')

    # Rewrite each absolute URL in L as a relative one; accepted URLs are
    # collected in new_urls for the crawler's (threaded) fetchers to visit.
    for item in L:
        u = item.url
        follow = should_follow(u) #and (counter < 10)
        if follow:
            if config.debug:
                print('ACCEPTED   - ', u)
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url)
        else:
            # James, let's keep everything by default (but not follow it).
            # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
            #  item.url = ''
            if config.debug:
                print('NOT INCLUDED     - ', u)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')

    newdoc = pos_html_transform(newdoc, url, filename)

    # Remove byte-literal artifacts (escaped whitespace and the b'...'
    # wrapper) left behind when bytes were converted with str().
    newdoc = newdoc.replace('\\n', '\n')
    newdoc = newdoc.replace('\\t', '\t')
    newdoc = newdoc.strip("b'")

    return (newdoc, new_urls)
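The byte-artifact cleanup at the end of this example is compensating for str() having been called on a bytes object somewhere upstream, which yields the repr (a b'...' wrapper with backslash-escaped whitespace) rather than the text. Decoding at the source is the cleaner fix when that code can be changed:

raw = b'<html>\n\t<body></body>\n</html>'

# str() on bytes produces the repr: leading b', trailing ', and
# escaped whitespace -- exactly the artifacts stripped above.
bad = str(raw)  # "b'<html>\\n\\t<body></body>\\n</html>'"

# Decoding recovers the actual text; no cleanup pass needed.
good = raw.decode('utf-8')
assert good == '<html>\n\t<body></body>\n</html>'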
Example #7
def parse_html(doc, url, filename):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    global config
    global counter

    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'

    new_urls = []

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
    doc = doc.replace('-->', END_COMMENT_REPLACE)


    L = htmldata.urlextract(doc, url, 'text/html')

    # Rewrite each absolute URL in L as a relative one; accepted URLs are
    # collected in new_urls for the crawler's (threaded) fetchers to visit.
    for item in L:
        u = item.url
        follow = should_follow(u) #and (counter < 10)
        if follow:
            if config.debug:
                print 'ACCEPTED   - ', u
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url)
        else:
            # James, let's keep everything by default (but not follow it).
            # if not any( license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org') ):
            #  item.url = ''
            if config.debug:
                print 'NOT INCLUDED     - ', u

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')

    newdoc = pos_html_transform(newdoc, url, filename)

    return (newdoc, new_urls)
Example #8
def parse_css(doc, url, config):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links found in the CSS.
    """
    new_urls = []

    L = htmldata.urlextract(doc, url, 'text/css')
    for item in L:
        # Store url locally.
        u = item.url
        new_urls += [u]
        item.url = url_to_relative(u, url, config)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = post_css_transform(newdoc, url, config)

    return (newdoc, new_urls)
Example #9
def parse_css(doc, url, config):
  """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links found in the CSS.
  """
  new_urls = []  

  L = htmldata.urlextract(doc, url, 'text/css')
  for item in L:
    # Store url locally.
    u = item.url
    new_urls += [u]
    item.url = url_to_relative(u, url, config)

  newdoc = htmldata.urljoin(doc, L)
  newdoc = post_css_transform(newdoc, url, config)

  return (newdoc, new_urls)
Example #10
def parse_html(doc, url):
    """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links we want to spider in the HTML.
  """
    global config

    BEGIN_COMMENT_REPLACE = "<BEGINCOMMENT-" + str(random.random()) + ">"
    END_COMMENT_REPLACE = "<ENDCOMMENT-" + str(random.random()) + ">"

    new_urls = []

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace("<!--", BEGIN_COMMENT_REPLACE)
    doc = doc.replace("-->", END_COMMENT_REPLACE)

    L = htmldata.urlextract(doc, url, "text/html")

    for item in L:
        u = item.url
        follow = should_follow(u)
        if follow:
            if config.debug:
                print "ACCEPTED   - ", u
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url)
        else:
            if not any(
                license in u for license in ("creativecommons.org", "wxwidgets.org", "gnu.org", "mediawiki.org")
            ):
                item.url = ""
            if config.debug:
                print "DENIED     - ", u

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, "<!--")
    newdoc = newdoc.replace(END_COMMENT_REPLACE, "-->")

    newdoc = pos_html_transform(newdoc, url)

    return (newdoc, new_urls)
Example #11
def parse_html(doc, url):
    """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links we want to spider in the HTML.
  """
    global config

    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'

    new_urls = []

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
    doc = doc.replace('-->', END_COMMENT_REPLACE)

    L = htmldata.urlextract(doc, url, 'text/html')

    for item in L:
        u = item.url
        follow = should_follow(u)
        if follow:
            if config.debug:
                print 'ACCEPTED   - ', u
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url)
        else:
            if not any(license in u
                       for license in ('creativecommons.org', 'wxwidgets.org',
                                       'gnu.org', 'mediawiki.org')):
                item.url = ''
            if config.debug:
                print 'DENIED     - ', u

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')

    newdoc = pos_html_transform(newdoc, url)

    return (newdoc, new_urls)
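Finally, every parse_html variant delegates its crawl/skip decision to a should_follow predicate that none of the examples define. A minimal same-host sketch (hypothetical; the real implementations also consult config, and some variants take the current page's URL as an extra argument):

from urllib.parse import urlsplit

START_NETLOC = urlsplit('http://example.com/').netloc  # assumed crawl root

def should_follow(u):
    """Hypothetical sketch: follow only http(s) links that stay on the
    same host as the crawl's starting URL."""
    parts = urlsplit(u)
    return parts.scheme in ('http', 'https') and parts.netloc == START_NETLOC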