Esempio n. 1
0
def visit(url):
    """Fetch *url*, clean its HTML and hand the parsed tree to ``extract``.

    URLs outside ``base_url`` are skipped, as are pages that fail to load.
    As a debugging aid, the raw and cleaned page are dumped side by side
    into ``testing.txt``.
    """
    # Crawl only within the base site.
    if not url.startswith(base_url):
        return

    try:
        resp = urlopen(url)
    except URLError:
        # Best-effort crawl: unreachable pages are silently skipped.
        return

    page = resp.read()

    cleaner = Cleaner()
    # Fixed typo: was ``cleaner.javasript``, which set a meaningless
    # attribute and left the real option untouched.
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    clean_page = cleaner.clean_html(page)

    # Debug dump: raw page vs cleaned page.
    # NOTE(review): on Python 3 ``resp.read()`` returns bytes, which cannot
    # be mixed with the str separator below — confirm this runs on Python 2
    # or decode ``page`` first.
    with open("testing.txt", 'w') as f:
        f.write(page)
        f.write("\n\n\nVS\n\n\n")
        f.write(clean_page)

    soup = BeautifulSoup(clean_page, "lxml")
    # Was unreachable: a bare ``return`` preceded this call in the original.
    extract(soup, url)
Esempio n. 2
0
def visit(url):
	"""Fetch *url*, strip scripts/styles/ignored tags and run ``extract``.

	URLs outside ``base_url`` are skipped, as are pages that fail to load.
	"""
	# Crawl only within the base site.
	if not url.startswith(base_url):
		return

	try:
		resp = urlopen(url)
	except URLError:
		# Best-effort crawl: unreachable pages are silently skipped.
		return

	page = resp.read()

	cleaner = Cleaner()
	# Fixed typo: was ``cleaner.javasript``, which set a meaningless
	# attribute and left the real option untouched.
	cleaner.javascript = True
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)