def collect_links_and_data(self, page_url):
    # Work around an SSL certificate verification issue on some macOS Python installs
    if (not os.environ.get('PYTHONHTTPSVERIFY', '')
            and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    try:
        response = urlopen(page_url)
        if "text/html" not in response.getheader("Content-Type"):
            return []  # Not an HTML page, so there are no links or data to collect
        html_bytes = response.read()                  # Read the byte stream from the response
        html_string = html_bytes.decode("utf-8")      # Decode the byte stream as UTF-8
        parser = WebParser(self.base_url)             # Initialise the custom web parser
        parser.feed(html_string)                      # Parse the HTML document
        self.data_list = parser.get_data_with_tags()  # Store the tagged data found on the page
    except Exception as e:
        print("Error: " + str(e))
        print("Program will terminate")
        sys.exit()
    return parser.get_page_urls()  # URLs discovered on this page
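# The method above assumes a WebParser class defined elsewhere in the project.
# The sketch below only illustrates the interface it relies on
# (feed(), get_data_with_tags(), get_page_urls()); the attribute names and the
# link/data collection logic are assumptions, not the project's actual parser.
from html.parser import HTMLParser
from urllib.parse import urljoin


class WebParser(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.page_urls = []       # Links discovered while parsing
        self.data_with_tags = []  # (tag, text) pairs collected from the page
        self._current_tag = None

    def handle_starttag(self, tag, attrs):
        self._current_tag = tag
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    # Resolve relative links against the crawler's base URL
                    self.page_urls.append(urljoin(self.base_url, value))

    def handle_data(self, data):
        text = data.strip()
        if text and self._current_tag:
            self.data_with_tags.append((self._current_tag, text))

    def get_data_with_tags(self):
        return self.data_with_tags

    def get_page_urls(self):
        return self.page_urls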