def visit(url):
    """Fetch *url*, strip scripts/styles/ignored tags, and extract its content.

    Debug variant: dumps the raw page and the cleaned page to ``testing.txt``
    for side-by-side comparison before extraction.

    Returns whatever ``extract`` returns, or ``None`` when the URL is outside
    ``base_url`` or cannot be fetched.
    """
    # Only crawl pages under the site root.
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        # Unreachable/broken page: skip it silently (best-effort crawl).
        return
    page = resp.read()  # bytes in Python 3

    cleaner = Cleaner()
    # BUG FIX: was misspelled 'javasript', which silently set a meaningless
    # attribute instead of enabling script removal explicitly.
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    clean_page = cleaner.clean_html(page)

    # Debug dump: raw page vs. cleaned page. Binary mode because page is
    # bytes (text mode 'w' would raise TypeError on f.write(page)).
    # 'with' guarantees the handle is closed even if a write fails.
    with open("testing.txt", "wb") as f:
        f.write(page)
        f.write(b"\n\n\nVS\n\n\n")
        f.write(clean_page)

    soup = BeautifulSoup(clean_page, "lxml")
    return extract(soup, url)
def visit(url):
    """Fetch *url*, strip scripts/styles/ignored tags, and extract its content.

    Returns whatever ``extract`` returns, or ``None`` when the URL is outside
    ``base_url`` or cannot be fetched.
    """
    # Only crawl pages under the site root.
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        # Unreachable/broken page: skip it silently (best-effort crawl).
        return
    page = resp.read()

    cleaner = Cleaner()
    # BUG FIX: was misspelled 'javasript', which silently set a meaningless
    # attribute instead of enabling script removal explicitly.
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    clean_page = cleaner.clean_html(page)
    soup = BeautifulSoup(clean_page, "lxml")
    # Return the extraction result for consistency with the sibling variant
    # (previously discarded; original callers relying on None are unaffected
    # only if they ignore the return value — backward-compatible otherwise).
    return extract(soup, url)