from urllib.request import urlopen


def gatherLinks(pageURL):
    # Fetch the page; if it is HTML, decode it and feed it to a Finder,
    # then return the set of links the Finder collected.
    html_string = ''
    try:
        response = urlopen(pageURL)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        # Construct the Finder outside the if so it is always bound,
        # even when the response is not HTML.
        finderObject = Finder(Spider.baseURL, pageURL)
        finderObject.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finderObject.pageLink()
from urllib.request import urlopen


def gather_links(page_url):
    # PEP 8 (snake_case) variant of the same routine: fetch, check the
    # Content-Type, decode, and let the Finder extract the page's links.
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = Finder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
import ssl
from urllib.request import Request, urlopen


def gather_link(page_url):
    # Variant that sends a browser User-Agent and skips SSL certificate
    # verification, so pages behind self-signed certificates can be crawled.
    html_string = ''
    try:
        req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
        context = ssl._create_unverified_context()
        response = urlopen(req, context=context)
        content_type = str(response.getheader('Content-Type'))
        # Honour the charset declared in the response; fall back to UTF-8
        # instead of crashing when no charset is declared.
        charset = content_type.split('charset=')[1] if 'charset=' in content_type else 'utf-8'
        if 'text/html' in content_type:
            html_bytes = response.read()
            html_string = html_bytes.decode(charset)
            # TheHTMLParse is constructed for its side effects; the instance
            # itself is not used afterwards.
            parser = TheHTMLParse(html_string, Spider.project_name, page_url)
        finder = Finder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        logthis("Spider: sorry, I can't crawl this page ...", Spider.project_name)
        return set()
    return finder.page_links()
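# All three gather functions above depend on a Finder class that is not shown
# in this section. The sketch below is an assumption, not the project's actual
# implementation: it follows the constructor signature and the page_links()
# call used above, subclasses html.parser.HTMLParser, and resolves every
# anchor href against the page URL.
from html.parser import HTMLParser
from urllib.parse import urljoin


class Finder(HTMLParser):
    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Record the absolute form of every href found on an anchor tag.
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href' and value:
                    self.links.add(urljoin(self.page_url, value))

    def page_links(self):
        return self.links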
import html
from urllib.request import Request, urlopen


def crawl(url):
    # Crawl a URL once: fetch it, extract its links and data, then move it
    # from the pending list to the crawled list.
    if url not in spider.links_crawled:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as urldata:
            data = urldata.read()
        f = Finder()
        f.baseurl = spider.website_url
        data = data.decode('utf-8')
        # Unescape HTML entities before parsing so hrefs like &amp; resolve.
        data = html.unescape(data)
        f.feed(data)
        f.close()
        links = f.return_links()
        spider.links_website.remove(url)
        for val in links:
            spider.links_website.append(val)
        spider.links_crawled.append(url)
        spider.data_dict[url] = f.return_data()
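# crawl() mutates shared state on a spider object that is likewise not shown
# here, and it expects a Finder with a no-argument constructor and
# return_links()/return_data() methods, a different interface from the sketch
# above. The names below mirror the attributes crawl() touches; the values are
# hypothetical placeholders for illustration.
class spider:
    website_url = 'https://example.com'      # base URL of the site (assumed)
    links_website = ['https://example.com']  # frontier of URLs still to visit
    links_crawled = []                       # URLs already fetched
    data_dict = {}                           # page URL -> data from the Finder


# A possible driver loop: keep crawling until the frontier is empty. crawl()
# only removes a URL from the frontier when it actually fetches it, so
# already-crawled duplicates are dropped here to avoid spinning forever.
while spider.links_website:
    next_url = spider.links_website[0]
    if next_url in spider.links_crawled:
        spider.links_website.remove(next_url)
    else:
        crawl(next_url)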