Ejemplo n.º 1
def parse_html(doc, url, config):
    """Rewrite the links of an HTML page and collect the URLs to spider.

    Each URL that ``htmldata.urlextract`` reports for ``doc`` is tested with
    ``should_follow``.  An accepted URL is recorded in the result list and
    replaced in the document by the output of ``url_to_relative`` passed
    through ``rewrite_external_url``.  URLs that are not accepted are not
    modified by this function.

    Args:
        doc: HTML source of the page, as text.
        url: URL of the page itself; handed to the extraction and rewriting
            helpers.
        config: Settings object forwarded to the helper functions.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links we want to spider in the HTML.
    """
    # Unique placeholders, so restoring the comment delimiters later cannot
    # collide with text that is already part of the page.
    begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

    if TIDY:
        doc = str(tidy.parseString(doc, output_xhtml=1, wrap=0))

    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', begin_marker)
    doc = doc.replace('-->', end_marker)

    new_urls = []
    items = htmldata.urlextract(doc, url, 'text/html')
    for item in items:
        target = item.url
        if should_follow(url, target, config):
            # Remember the original URL, then point the document at the
            # rewritten one.
            new_urls.append(target)
            item.url = url_to_relative(target, url, config)
            # NOTE(review): rewrite_external_url runs on links that ARE
            # followed, right after they were made relative -- confirm this
            # is intended and that it does not belong in an `else` branch.
            item.url = rewrite_external_url(item.url, config)

    newdoc = htmldata.urljoin(doc, items)
    newdoc = newdoc.replace(begin_marker, '<!--')
    newdoc = newdoc.replace(end_marker, '-->')
    newdoc = newdoc.replace('<br>', '<br/>')
    newdoc = post_html_transform(newdoc, url, config)
    return (newdoc, new_urls)
Ejemplo n.º 2
def parse_css(doc, url):
    """Rewrite the links of a CSS document and collect the URLs it references.

    Each URL that ``htmldata.urlextract`` reports for ``doc`` is recorded in
    the result list and replaced in the document by the output of
    ``url_to_relative``.  When the module-level ``config.no_images`` is set,
    URLs ending in ``.jpg``, ``.gif``, ``.png`` or ``.ico`` are blanked in
    the document instead and are not recorded.

    Args:
        doc: CSS source, as text.
        url: URL of the stylesheet itself; handed to the extraction and
            rewriting helpers.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links found in the CSS.
    """
    image_suffixes = (".jpg", ".gif", ".png", ".ico")
    new_urls = []

    items = htmldata.urlextract(doc, url, "text/css")
    for item in items:
        target = item.url

        if config.no_images and target.strip().lower().endswith(image_suffixes):
            # Drop the image reference and skip the rest of the loop body;
            # without the `continue` the blank URL was overwritten just below
            # and the image was still queued for download.
            item.url = ""
            continue

        # Remember the original URL, then point the document at the
        # rewritten one.
        new_urls.append(target)
        item.url = url_to_relative(target, url)

    newdoc = htmldata.urljoin(doc, items)
    newdoc = post_css_transform(newdoc, url)

    return (newdoc, new_urls)
Ejemplo n.º 3
def parse_css(doc, url):
    """Rewrite the links of a CSS document and collect the URLs it references.

    Each URL that ``htmldata.urlextract`` reports for ``doc`` is recorded in
    the result list and replaced in the document by the output of
    ``url_to_relative``.  When the module-level ``config.no_images`` is set,
    URLs ending in ``.jpg``, ``.gif``, ``.png`` or ``.ico`` are blanked in
    the document instead and are not recorded.

    Args:
        doc: CSS source, as text.
        url: URL of the stylesheet itself; handed to the extraction and
            rewriting helpers.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links found in the CSS.
    """
    image_suffixes = ('.jpg', '.gif', '.png', '.ico')
    new_urls = []

    items = htmldata.urlextract(doc, url, 'text/css')
    for item in items:
        target = item.url

        if config.no_images and target.strip().lower().endswith(image_suffixes):
            # Drop the image reference and skip the rest of the loop body;
            # without the `continue` the blank URL was overwritten just below
            # and the image was still queued for download.
            item.url = ''
            continue

        # Remember the original URL, then point the document at the
        # rewritten one.
        new_urls.append(target)
        item.url = url_to_relative(target, url)

    newdoc = htmldata.urljoin(doc, items)
    newdoc = post_css_transform(newdoc, url)

    return (newdoc, new_urls)
Ejemplo n.º 4
def parse_html(doc, url, config):
  """Rewrite the links of an HTML page and collect the URLs to spider.

  Each URL that ``htmldata.urlextract`` reports for ``doc`` is tested with
  ``should_follow``.  An accepted URL is recorded in the result list and
  replaced in the document by the output of ``url_to_relative`` passed
  through ``rewrite_external_url``.  URLs that are not accepted are not
  modified by this function.

  Args:
    doc: HTML source of the page, as text.
    url: URL of the page itself; handed to the extraction and rewriting
      helpers.
    config: Settings object forwarded to the rewriting helpers.

  Returns:
    A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
    URLs for all links we want to spider in the HTML.
  """
  # Unique placeholders, so restoring the comment delimiters later cannot
  # collide with text that is already part of the page.
  begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
  end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

  # Temporarily "get rid" of comments so htmldata will find the URLs
  # in the funky "<!--[if" HTML hackery for IE.
  doc = doc.replace('<!--', begin_marker)
  doc = doc.replace('-->', end_marker)

  new_urls = []
  items = htmldata.urlextract(doc, url, 'text/html')
  for item in items:
    target = item.url
    if should_follow(url, target):
      # Remember the original URL, then point the document at the
      # rewritten one.
      new_urls.append(target)
      item.url = url_to_relative(target, url, config)
      # NOTE(review): rewrite_external_url runs on links that ARE followed,
      # right after they were made relative -- confirm this is intended and
      # that it does not belong in an `else` branch.
      item.url = rewrite_external_url(item.url, config)

  newdoc = htmldata.urljoin(doc, items)
  newdoc = newdoc.replace(begin_marker, '<!--')
  newdoc = newdoc.replace(end_marker, '-->')
  newdoc = post_html_transform(newdoc, url, config)

  return (newdoc, new_urls)
Ejemplo n.º 5
def parse_html(doc, url, filename):
    """Rewrite the links of an HTML page and collect the URLs to spider.

    The page is first passed through ``pre_html_transform``.  Each URL that
    ``htmldata.urlextract`` then reports is tested with ``should_follow``:
    an accepted URL is recorded in the result list and replaced in the
    document by the output of ``url_to_relative``; a rejected URL is left in
    the document unchanged.  When the module-level ``config.debug`` is set,
    the decision for every URL is printed.

    Args:
        doc: HTML source of the page, as text.
        url: URL of the page itself; handed to the transform, extraction and
            rewriting helpers.
        filename: Forwarded unchanged to ``pos_html_transform``.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links we want to spider in the HTML.
    """
    # Unique placeholders, so restoring the comment delimiters later cannot
    # collide with text that is already part of the page.
    begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', begin_marker)
    doc = doc.replace('-->', end_marker)

    new_urls = []
    items = htmldata.urlextract(doc, url, 'text/html')
    for item in items:
        target = item.url
        if should_follow(target):
            if config.debug:
                print('ACCEPTED   - ', target)
            # Remember the original URL, then point the document at the
            # relative one.
            new_urls.append(target)
            item.url = url_to_relative(target, url)
        elif config.debug:
            # Rejected links are kept in the page as they are (just not
            # followed); only the decision is reported.
            print('NOT INCLUDED     - ', target)

    newdoc = htmldata.urljoin(doc, items)
    newdoc = newdoc.replace(begin_marker, '<!--')
    newdoc = newdoc.replace(end_marker, '-->')

    newdoc = pos_html_transform(newdoc, url, filename)

    # Remove byte artifacts in string: literal backslash-n / backslash-t
    # sequences become real newlines / tabs, and 'b' characters are trimmed
    # from both ends (presumably the prefix left by str() on a bytes object
    # -- TODO confirm).
    newdoc = newdoc.replace('\\n', '\n')
    newdoc = newdoc.replace('\\t', '\t')
    newdoc = newdoc.strip('b')
    # NOTE(review): strip('') removes nothing; a quote character was
    # presumably intended here -- confirm before changing.
    newdoc = newdoc.strip('')

    return (newdoc, new_urls)
Ejemplo n.º 6
def parse_html(doc, url, filename):
    """Rewrite the links of an HTML page and collect the URLs to spider.

    The page is first passed through ``pre_html_transform``.  Each URL that
    ``htmldata.urlextract`` then reports is tested with ``should_follow``:
    an accepted URL is recorded in the result list and replaced in the
    document by the output of ``url_to_relative``; a rejected URL is left in
    the document unchanged.  When the module-level ``config.debug`` is set,
    the decision for every URL is printed.

    Args:
        doc: HTML source of the page, as text.
        url: URL of the page itself; handed to the transform, extraction and
            rewriting helpers.
        filename: Forwarded unchanged to ``pos_html_transform``.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links we want to spider in the HTML.
    """
    # Unique placeholders, so restoring the comment delimiters later cannot
    # collide with text that is already part of the page.
    begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', begin_marker)
    doc = doc.replace('-->', end_marker)

    new_urls = []
    items = htmldata.urlextract(doc, url, 'text/html')
    for item in items:
        target = item.url
        if should_follow(target):
            if config.debug:
                # Single pre-formatted argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print('ACCEPTED   -  %s' % (target,))
            # Remember the original URL, then point the document at the
            # relative one.
            new_urls.append(target)
            item.url = url_to_relative(target, url)
        elif config.debug:
            # Rejected links are kept in the page as they are (just not
            # followed); only the decision is reported.
            print('NOT INCLUDED     -  %s' % (target,))

    newdoc = htmldata.urljoin(doc, items)
    newdoc = newdoc.replace(begin_marker, '<!--')
    newdoc = newdoc.replace(end_marker, '-->')

    newdoc = pos_html_transform(newdoc, url, filename)

    return (newdoc, new_urls)
Ejemplo n.º 7
def parse_css(doc, url, config):
    """Rewrite the links of a CSS document and collect the URLs it references.

    Each URL that ``htmldata.urlextract`` reports for ``doc`` is recorded in
    the result list and replaced in the document by the output of
    ``url_to_relative``.

    Args:
        doc: CSS source, as text.
        url: URL of the stylesheet itself; handed to the extraction and
            rewriting helpers.
        config: Settings object forwarded to the helper functions.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links found in the CSS.
    """
    new_urls = []

    items = htmldata.urlextract(doc, url, 'text/css')
    for item in items:
        # Remember the original URL, then point the document at the
        # relative one.
        target = item.url
        new_urls.append(target)
        item.url = url_to_relative(target, url, config)

    newdoc = htmldata.urljoin(doc, items)
    newdoc = post_css_transform(newdoc, url, config)

    return (newdoc, new_urls)
Ejemplo n.º 8
def parse_css(doc, url, config):
  """Rewrite the links of a CSS document and collect the URLs it references.

  Each URL that ``htmldata.urlextract`` reports for ``doc`` is recorded in
  the result list and replaced in the document by the output of
  ``url_to_relative``.

  Args:
    doc: CSS source, as text.
    url: URL of the stylesheet itself; handed to the extraction and
      rewriting helpers.
    config: Settings object forwarded to the helper functions.

  Returns:
    A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
    URLs for all links found in the CSS.
  """
  new_urls = []

  items = htmldata.urlextract(doc, url, 'text/css')
  for item in items:
    # Remember the original URL, then point the document at the
    # relative one.
    target = item.url
    new_urls.append(target)
    item.url = url_to_relative(target, url, config)

  newdoc = htmldata.urljoin(doc, items)
  newdoc = post_css_transform(newdoc, url, config)

  return (newdoc, new_urls)
Ejemplo n.º 9
def parse_html(doc, url):
    """Rewrite the links of an HTML page and collect the URLs to spider.

    The page is first passed through ``pre_html_transform``.  Each URL that
    ``htmldata.urlextract`` then reports is tested with ``should_follow``:

    * an accepted URL is recorded in the result list and replaced in the
      document by the output of ``url_to_relative``;
    * a rejected URL is blanked in the document, unless it contains one of
      the license-site names listed in the function body.

    When the module-level ``config.debug`` is set, the decision for every
    URL is printed.

    Args:
        doc: HTML source of the page, as text.
        url: URL of the page itself; handed to the transform, extraction and
            rewriting helpers.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links we want to spider in the HTML.
    """
    # Links containing one of these names survive even when not followed.
    license_sites = ("creativecommons.org", "wxwidgets.org", "gnu.org", "mediawiki.org")

    # Unique placeholders, so restoring the comment delimiters later cannot
    # collide with text that is already part of the page.
    begin_marker = "<BEGINCOMMENT-" + str(random.random()) + ">"
    end_marker = "<ENDCOMMENT-" + str(random.random()) + ">"

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace("<!--", begin_marker)
    doc = doc.replace("-->", end_marker)

    new_urls = []
    items = htmldata.urlextract(doc, url, "text/html")
    for item in items:
        target = item.url
        if should_follow(target):
            if config.debug:
                # Single pre-formatted argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print("ACCEPTED   -  %s" % (target,))
            # Remember the original URL, then point the document at the
            # relative one.
            new_urls.append(target)
            item.url = url_to_relative(target, url)
        else:
            # Only rejected links are blanked; running this on accepted
            # links would erase the relative URL that was just computed.
            if not any(site in target for site in license_sites):
                item.url = ""
            if config.debug:
                print("DENIED     -  %s" % (target,))

    newdoc = htmldata.urljoin(doc, items)
    newdoc = newdoc.replace(begin_marker, "<!--")
    newdoc = newdoc.replace(end_marker, "-->")

    newdoc = pos_html_transform(newdoc, url)

    return (newdoc, new_urls)
Ejemplo n.º 10
def parse_html(doc, url):
    """Rewrite the links of an HTML page and collect the URLs to spider.

    The page is first passed through ``pre_html_transform``.  Each URL that
    ``htmldata.urlextract`` then reports is tested with ``should_follow``:

    * an accepted URL is recorded in the result list and replaced in the
      document by the output of ``url_to_relative``;
    * a rejected URL is blanked in the document, unless it contains one of
      the license-site names listed in the function body.

    When the module-level ``config.debug`` is set, the decision for every
    URL is printed.

    Args:
        doc: HTML source of the page, as text.
        url: URL of the page itself; handed to the transform, extraction and
            rewriting helpers.

    Returns:
        A tuple ``(modified_doc, new_urls)``, where ``new_urls`` are absolute
        URLs for all links we want to spider in the HTML.
    """
    # Links containing one of these names survive even when not followed.
    license_sites = ('creativecommons.org', 'wxwidgets.org',
                     'gnu.org', 'mediawiki.org')

    # Unique placeholders, so restoring the comment delimiters later cannot
    # collide with text that is already part of the page.
    begin_marker = '<BEGINCOMMENT-' + str(random.random()) + '>'
    end_marker = '<ENDCOMMENT-' + str(random.random()) + '>'

    doc = pre_html_transform(doc, url)
    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', begin_marker)
    doc = doc.replace('-->', end_marker)

    new_urls = []
    items = htmldata.urlextract(doc, url, 'text/html')
    for item in items:
        target = item.url
        if should_follow(target):
            if config.debug:
                # Single pre-formatted argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print('ACCEPTED   -  %s' % (target,))
            # Remember the original URL, then point the document at the
            # relative one.
            new_urls.append(target)
            item.url = url_to_relative(target, url)
        else:
            # Only rejected links are blanked; running this on accepted
            # links would erase the relative URL that was just computed.
            if not any(site in target for site in license_sites):
                item.url = ''
            if config.debug:
                print('DENIED     -  %s' % (target,))

    newdoc = htmldata.urljoin(doc, items)
    newdoc = newdoc.replace(begin_marker, '<!--')
    newdoc = newdoc.replace(end_marker, '-->')

    newdoc = pos_html_transform(newdoc, url)

    return (newdoc, new_urls)