def test_copy_preserves_encoding(self):
    """A copied soup keeps both the tree contents and the encoding
    that was detected when the original document was parsed."""
    # Parse from bytes so that encoding detection actually runs.
    original = BeautifulSoup(b'<p> </p>', 'html.parser')
    detected = original.original_encoding
    duplicate = original.__copy__()
    # The duplicate must render identically and report the same encoding.
    assert "<p> </p>" == str(duplicate)
    assert detected == duplicate.original_encoding
# Fetch a (possibly non-ASCII) page over verified HTTPS and collect the
# internal links it contains.
url = 'https://zh.moegirl.org/%E7%99%BD%E5%AD%A6'
# Percent-encode any non-printable (e.g. Chinese) characters while leaving
# already-encoded sequences and URL structure intact.
url = quote(url, safe=string.printable)

# HTTP client with certificate verification against the certifi CA bundle.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# NOTE: urllib3 routes only upper-case method names ('GET', 'HEAD', ...)
# through the URL-encoded request path; lower-case 'get' would be sent
# through the body-encoding path instead, so the method must be 'GET'.
# This call blocks on the network (consider running it in a worker thread).
response = http.request('GET', url)

if response.status == 200:  # succeed status:200
    print('Oh,yeah!')

bsObj = BeautifulSoup(response.data, "html.parser")

parseURL = parse.urlparse(url)
currentURL = parseURL.scheme + '://' + parseURL.netloc + parseURL.path

# Internal links either start with "/" or contain the current URL.
# re.escape makes metacharacters in the URL ('.', '?') match literally
# instead of being interpreted as regex syntax.
links = bsObj.findAll("a", href=re.compile("^(/|.*" + re.escape(currentURL) + ")"))

# Collect unique internal links, normalizing relative hrefs to absolute
# BEFORE the membership test so the same target is never stored twice
# (once relative, once absolute).
internalLinks = []
for link in links:
    href = link.attrs.get('href')
    if href is None:
        continue
    absolute = currentURL + href if href.startswith("/") else href
    if absolute not in internalLinks:
        internalLinks.append(absolute)

print('len(internalLinks):' + str(len(internalLinks)))
# save only the content