def gather_links(page_url):
    """Fetch *page_url* and return the set of links found on it.

    Returns an empty set when the page cannot be fetched/parsed, and also
    when the response is not HTML.  (The original fell through to an
    undefined ``finder`` for non-HTML responses, raising NameError at the
    final return, which sat outside the try/except.)
    """
    html_string = ''
    try:
        response = urlopen(page_url)
        # getheader() may return None; str() keeps the membership test safe.
        # Only parse HTML responses — skip images, PDFs, etc.
        if 'text/html' not in str(response.getheader('Content-Type')):
            return set()
        html_bytes = response.read()
        html_string = html_bytes.decode('utf-8')
        finder = Finder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_link(page_url):
    """Fetch *page_url* with a browser User-Agent (and an unverified SSL
    context) and return the set of links found on it.

    Returns an empty set when the page cannot be fetched or parsed.

    Fixes over the original:
    - network setup moved inside the try block (fetch errors were unhandled);
    - charset extraction no longer IndexErrors when the header declares none,
      and the declared charset is actually used for decoding (UTF-8 fallback);
    - content-type check accepts ``text/html`` without a trailing semicolon
      (previously ``finder`` stayed undefined → NameError at the return);
    - bare ``except:`` narrowed to ``except Exception``.
    """
    html_string = ''
    finder = None
    try:
        req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
        # NOTE(review): unverified context disables certificate checking —
        # deliberate for a best-effort crawler, but worth flagging.
        context = ssl._create_unverified_context()
        response = urlopen(req, context=context)
        content_type = str(response.getheader('Content-Type'))
        if 'text/html' not in content_type:
            return set()
        # Honour the declared charset when present; fall back to UTF-8.
        charset = 'utf-8'
        if 'charset=' in content_type:
            charset = content_type.split('charset=')[1].split(';')[0].strip()
        html_bytes = response.read()
        html_string = html_bytes.decode(charset, errors='replace')
        # TheHTMLParse is kept for its side effects (parsing/logging);
        # its return value was never used in the original either.
        parser = TheHTMLParse(html_string, Spider.project_name, page_url)
        finder = Finder(Spider.base_url, page_url)
        finder.feed(str(html_string))
    except Exception:
        logthis("Spider. Sorry sir i can't crawl this page ...", Spider.project_name)
        return set()
    if finder is None:
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Return the set of links reported by a Finder for *page_url*.

    Any failure while constructing the finder yields an empty set.
    """
    try:
        link_finder = Finder(page_url)
    except Exception:
        # Best-effort: an unreachable/unparseable page contributes no links.
        return set()
    return link_finder.page_links()