def parse_html(page_url):
    """Download ``page_url`` and collect the links found in it.

    The body is read and decoded as UTF-8 only when the ``Content-Type``
    header contains ``text/html``; for any other (or missing) content type
    an empty string is fed to the parser instead.

    Args:
        page_url: URL of the page to fetch.

    Returns:
        A ``(links, html_string)`` tuple. ``links`` is the result of
        ``HtmlParser.page_links()`` on success, or an empty ``set`` when any
        step raised. ``html_string`` is the decoded page text, or ``''`` if
        nothing was decoded before the failure.
    """
    html_string = ''
    try:
        # The context manager releases the connection even when reading or
        # decoding raises; the original never closed the response.
        with urlopen(page_url, timeout=5) as response:
            # getheader() returns None when the header is absent, which would
            # make the ``in`` test raise TypeError.
            content_type = response.getheader('Content-Type') or ''
            if 'text/html' in content_type:
                html_string = response.read().decode("utf-8")
        # Built unconditionally so ``finder`` is always bound at the final
        # return, whatever the content type was.
        finder = HtmlParser(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        # Deliberately best-effort: report the error and return no links
        # instead of propagating it to the caller.
        print(str(e))
        return set(), html_string
    return finder.page_links(), html_string
def parse_html(self):
    """
    parse_html - Run the stored page through an HtmlParser.

    A parser is created for ``self.url``, given ``self.pattern``,
    ``self.spider_config`` and ``self.depth`` through its setters, fed
    ``self.page`` and then closed.

    Returns:
        True when parsing finished, False when a UnicodeDecodeError was
        raised (the failure is logged with the thread id and url).
    """
    try:
        html_parser = HtmlParser(self.url)
        html_parser.set_pattern(self.pattern)
        html_parser.set_urls(self.spider_config)
        html_parser.set_next_depth(self.depth)
        html_parser.feed(self.page)
        html_parser.close()
    except UnicodeDecodeError as err:
        message = 'Thread:{} parse {} failed, msg:{}'.format(
            self.thread_id, self.url, err)
        logging.error(message)
        return False
    else:
        return True