Example #1
0
    def collect_links_and_data(self, page_url):
        """Fetch *page_url*, parse it, and return the page URLs found.

        The response body is read and decoded as UTF-8 only when the
        ``Content-Type`` header contains ``text/html``; otherwise (including
        when the header is absent) an empty string is fed to the parser.
        The result of ``parser.get_data_with_tags()`` is stored on
        ``self.data_list``.

        Args:
            page_url: URL passed straight to ``urlopen``.

        Returns:
            Whatever ``WebParser.get_page_urls()`` returns for the page.

        Raises:
            SystemExit: on any exception while fetching or parsing; the error
                is printed first and ``sys.exit()`` is called.
        """
        # Fixes ssl issue for some mac users
        # NOTE(security): this swaps the default HTTPS context factory for the
        # unverified one, which turns off certificate verification for the
        # whole process, not just this request. Prefer installing the system
        # CA certificates; kept here because it is a deliberate workaround.
        if (not os.environ.get('PYTHONHTTPSVERIFY', '')
                and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        try:
            html_string = ""
            # The context manager closes the connection even if reading or
            # decoding fails.
            with urlopen(page_url) as response:
                # getheader() returns None when the header is missing; fall
                # back to "" so such a response is treated as non-HTML
                # instead of raising TypeError on the membership test.
                content_type = response.getheader("Content-Type") or ""
                if "text/html" in content_type:
                    html_bytes = response.read()
                    html_string = html_bytes.decode("utf-8")

            parser = WebParser(self.base_url)
            parser.feed(html_string)
            self.data_list = parser.get_data_with_tags()
        except Exception as e:
            print("Error: " + str(e))
            print("Program will terminate")
            sys.exit()
        # Only reached on success: the except branch always exits.
        return parser.get_page_urls()
    def collect_links_and_data(self, page_url):
        """Fetch *page_url*, parse it, and return the page URLs found.

        The response body is read and decoded as UTF-8 only when the
        ``Content-Type`` header contains ``text/html``; otherwise (including
        when the header is absent) an empty string is fed to the parser.
        The result of ``parser.get_data_with_tags()`` is stored on
        ``self.data_list``.

        Args:
            page_url: URL passed straight to ``urlopen``.

        Returns:
            Whatever ``WebParser.get_page_urls()`` returns for the page.

        Raises:
            SystemExit: on any exception while fetching or parsing; the error
                is printed first and ``sys.exit()`` is called.
        """
        try:
            html_string = ""
            # The context manager closes the connection even if reading or
            # decoding fails.
            with urlopen(page_url) as response:
                # getheader() returns None when the header is missing; fall
                # back to "" so such a response is treated as non-HTML
                # instead of raising TypeError on the membership test.
                content_type = response.getheader("Content-Type") or ""
                if "text/html" in content_type:
                    html_bytes = response.read()
                    html_string = html_bytes.decode("utf-8")

            parser = WebParser(self.base_url)
            parser.feed(html_string)
            self.data_list = parser.get_data_with_tags()
        except Exception as e:
            print("Error: " + str(e))
            print("Program will terminate")
            sys.exit()
        # Only reached on success: the except branch always exits.
        return parser.get_page_urls()