Example No. 1
File: web.py Project: viperfx/GSOC
def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the dom into python objects
    html = lxml.html.document_fromstring(r.content)
    # parse the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring("<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to have absolute url
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through DOM Traversal
    # html = translate_dom_string(html, lxml.html.tostring(html))
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # dump the html string for debugging
    # with open('html_dump', 'w') as f:
    #     f.write(lxml.html.tostring(html))
    # a little regex to remove any script tags
    return re.subn(r'(?s)<(script).*?</\1>', '', lxml.html.tostring(html, encoding='unicode'))[0]
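The handler above relies on names that are never shown: Flask's request object, the requests, lxml.html, re, and urlparse imports, a translate_html helper defined elsewhere in the project, and a route registration. A minimal sketch of that assumed surrounding setup (module layout and route path are guesses, not taken from viperfx/GSOC):

import re
from urllib.parse import urlparse

import lxml.html
import requests
from flask import Flask, request

app = Flask(__name__)

@app.route('/get_page')  # hypothetical route; the project may bind the handler differently
def get_page():
    ...  # body as shown in Example No. 1 above; translate_html is assumed to be defined elsewhere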
Example No. 2
def fetch_links_from_web_page(self, page):
    log.debug('')
    try:
        # [ NOTE ]: Pull out all links after resolving them using any
        #           <base> tags found in the document.
        links = [
            link for element, attribute, link, pos in iterlinks(
                resolve_base_href(page.content))
        ]
    except etree.ParseError:
        # [ NOTE ]: If the document is not HTML content this will return
        #           an empty list.
        links = []
    return list(set(links))
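Here iterlinks and resolve_base_href are the module-level helpers from lxml.html, etree comes from lxml, and page only needs a .content attribute, as on a requests response. A sketch of the imports this method appears to assume (an inference from the names used, not code from the source project):

import logging

import requests
from lxml import etree
from lxml.html import iterlinks, resolve_base_href

log = logging.getLogger(__name__)

# Illustrative call site: any response-like object with a .content attribute works.
# links = crawler.fetch_links_from_web_page(requests.get('https://example.com'))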
Example No. 3
def crawl(url, thread_id=0):
    global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS
    if not OVERRIDE_SIZE:
        try:
            # Attempt to get the size in bytes of the document
            length = int(
                requests.head(url, headers=HEADER).headers['Content-Length'])
        except KeyError:  # Sometimes no Content-Length header is returned...
            length = 1
        if length > 524288000:  # If the page is larger than 500 MB
            raise SizeError
    # If the SizeError is raised it will be caught in the except block in the run section,
    # and the following code will not be run.
    page = requests.get(url, headers=HEADER)  # Get page
    word_list = []
    doctype = get_mime_type(page)
    if doctype.find('image') < 0 and doctype.find('video') < 0:
        if SAVE_WORDS:
            word_list = make_words(page)
            for word in word_list:
                WORDS.put(word)
        try:
            # Pull out all links after resolving them using any <base> tags found in the document.
            links = [
                link for element, attribute, link, pos in iterlinks(
                    resolve_base_href(make_links_absolute(page.content, url)))
            ]
        except etree.ParseError:
            # If the document is not HTML content this will return an empty list.
            links = []
        links = list(set(links))
    else:
        links = []
    if SAVE_PAGES:
        save_page(url, page)
    if SAVE_WORDS:
        # Announce which link was crawled
        write_log('CRAWL',
                  'Found {0} links and {1} words on {2}'.format(
                      len(links), len(word_list), url),
                  worker=thread_id)
    else:
        # Announce which link was crawled
        write_log('CRAWL',
                  'Found {0} links on {1}'.format(len(links), url),
                  worker=thread_id)
    return links
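This crawler depends on module-level globals and project helpers (get_mime_type, make_words, save_page, write_log) that are not shown. A hedged sketch of the assumed globals plus the lxml and requests imports; the concrete values and the SizeError definition are guesses:

import queue

import requests
from lxml import etree
from lxml.html import iterlinks, make_links_absolute, resolve_base_href


class SizeError(Exception):
    """Raised when a document exceeds the 500 MB limit checked above."""


WORDS = queue.Queue()                  # thread-safe sink for harvested words
HEADER = {'User-Agent': 'crawler'}     # hypothetical header value
OVERRIDE_SIZE = False
SAVE_PAGES = True
SAVE_WORDS = True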
Example No. 4
def crawl(url):
    global TODO
    if not OVERRIDE_SIZE:
        try:
            # Attempt to get the size in bytes of the document
            length = int(
                requests.head(url, headers=HEADER).headers['Content-Length'])
        except KeyError:  # Sometimes no Content-Length header is returned...
            length = 1
        if length > 524288000:  # If the page is larger than 500 MB
            raise SizeError
    # If the SizeError is raised it will be caught in the except block in the run section,
    # and the following code will not be run.
    page = requests.get(url, headers=HEADER)  # Get page
    word_list = []
    if SAVE_WORDS:
        word_list = make_words(page)
        WORDS.update(word_list)
    try:
        # Pull out all links after resolving them using any <base> tags found in the document.
        links = [
            link for element, attribute, link, pos in iterlinks(
                resolve_base_href(page.content))
        ]
    except etree.ParseError:
        # If the document is not HTML content this will return an empty list.
        links = []
    links = list(set(links))
    TODO += links
    DONE.append(url)
    if SAVE_PAGES:
        save_page(url, page)
    if SAVE_WORDS:
        # Announce which link was crawled
        write_log('[CRAWL]: Found {0} links and {1} words on {2}'.format(
            len(links), len(word_list), url))
    else:
        # Announce which link was crawled
        write_log('[CRAWL]: Found {0} links on {1}'.format(len(links), url))
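The in-code comment says a raised SizeError "will be caught in the except block in the run section", and the globals suggest TODO and DONE are plain lists while WORDS is a set. A sketch of what such a run loop might look like (the loop itself is an assumption; only the names come from the snippet):

TODO = ['https://example.com']   # seed URL, hypothetical
DONE = []
WORDS = set()

while TODO:
    url = TODO.pop(0)
    if url in DONE:
        continue
    try:
        crawl(url)
    except SizeError:
        DONE.append(url)   # oversized documents are skipped, as the comment above describes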
Example No. 5
########
VI = j['GroupItem'][1]  # VI
print(VI['GroupItemName'])
########
DIAG = j['GroupItem'][2] # diagnostics
print(DIAG['GroupItemName'])
########
DRVR = j['GroupItem'][3] # sound
print(DRVR['GroupItemName'])
########
UTIL = j['GroupItem'][10] # removable storage devices
print(UTIL['GroupItemName'])
# for x in iter_:
#     print(x.get
for x in range(11):
    print(j['GroupItem'][x]['GroupItemName'])
links = html.resolve_base_href(r.text)
# http://downloads.dell.com/comm/R85670.EXE  'R80894.EXE'  "input#tagDrivers")
for x in iter_:
    try:
        print(x[0].attrib['href'])  # , "----", x[1], "---", x[2]
        if 'http://downloads' in x[0].attrib['href']:  # 'http://downloads.dell.com/comm/R85670.EXE':
            print()
    except KeyError:
        # elements whose link is not carried in an href attribute (e.g. src links) are skipped
        pass
print()
###################################################################
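This last fragment assumes a parsed driver-group document j, an HTTP response r, and a link iterator iter_ that are never shown. One plausible reconstruction of that setup, assuming the group list arrives as JSON and the links come from lxml's iterlinks (everything here, including the URL placeholder, is a guess):

import json

import requests
from lxml import html

r = requests.get('https://www.dell.com/support/...')   # placeholder; the real page is not given
j = json.loads(r.text)                                  # expected to contain a 'GroupItem' list
doc = html.document_fromstring(r.text)
doc.resolve_base_href()                                 # rewrite links against any <base> tag
iter_ = doc.iterlinks()                                 # yields (element, attribute, link, pos) tuples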