import os
import time

import requests
from bs4 import BeautifulSoup

# get_headers(), get_proxies(), and the test_*_proxy_base_url constants are
# assumed to be defined elsewhere in this module.


def parse_meizitu(url):
    """Find every <img> tag on the page and download its image."""
    response = requests.get(url=url, headers=get_headers())
    soup = BeautifulSoup(response.content, "html.parser")
    jpg_tags = soup.find_all("img")
    for tag in jpg_tags:
        jpg_url = tag.get("src")
        if jpg_url:  # skip <img> tags without a src attribute
            save_img(jpg_url)
def save_img(url):
    """Save the image at `url` under dir_root, named by the current timestamp."""
    dir_root = "G:\\picture"
    os.makedirs(dir_root, exist_ok=True)  # create the directory tree if missing
    response = requests.get(url=url, headers=get_headers())
    file_name = str(time.time()) + os.path.splitext(url)[1]
    with open(os.path.join(dir_root, file_name), "wb") as f:
        f.write(response.content)
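# A minimal driver sketch for parse_meizitu(), assuming the site paginates
# its galleries with a "?page=N" query string; the pagination scheme and any
# base URL passed in are hypothetical placeholders, not taken from the
# original code.
def crawl_gallery_pages(base_url, pages):
    """Run parse_meizitu over a range of paginated gallery URLs."""
    for page in range(1, pages + 1):
        parse_meizitu("%s?page=%d" % (base_url, page))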
def parse_url_to_html(url, file_name):
    """Fetch a tutorial page, extract its main content, and save it as HTML."""
    try:
        count = 0
        response = requests.get(url, headers=get_headers(), proxies=get_proxies())
        # status_code is an int; retry up to 5 times on any non-200 response
        while response.status_code != 200 and count < 5:
            response = requests.get(url, headers=get_headers(), proxies=get_proxies())
            count += 1
        if response.status_code != 200:
            print(response)
            return
    except Exception as e:
        print(e)
        return
    soup = BeautifulSoup(response.content, "html.parser")
    body = soup.find_all(class_="x-wiki-content x-main-content")[0]
    html = str(body).encode("utf-8")
    with open(file_name, "wb") as f:
        f.write(html)
def get_url_list():
    """Collect the URL of every chapter from the tutorial's table of contents."""
    base_url = "https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
    response = requests.get(base_url, headers=get_headers(), proxies=get_proxies())
    soup = BeautifulSoup(response.content, "html.parser")
    # the second sidebar nav holds the chapter menu
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for div in menu_tag.find_all("div"):
        urls.append("https://www.liaoxuefeng.com" + div.a.get("href"))
    return urls
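# A sketch of how get_url_list() and parse_url_to_html() are likely meant to
# work together: fetch the table of contents, then save each chapter to its
# own numbered HTML file. The output naming scheme is an assumption, not part
# of the original code.
def download_all_chapters():
    """Save every chapter of the tutorial as chapter_<n>.html."""
    for index, url in enumerate(get_url_list()):
        parse_url_to_html(url, "chapter_%d.html" % index)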
def test_proxy(ip, proxy_type):
    """Return True if the proxy `ip` can reach the test URL within 2 seconds."""
    if proxy_type == "http":
        base_test_url = test_http_proxy_base_url
    elif proxy_type == "https":
        base_test_url = test_https_proxy_base_url
    else:
        raise RuntimeError("invalid proxy type: expected 'http' or 'https'")
    try:
        requests.get(url=base_test_url, headers=get_headers(),
                     proxies={proxy_type: ip}, timeout=2)
    except requests.RequestException:
        print("%s connection failed ..." % ip)
        return False
    else:
        print("%s connection success..." % ip)
        return True
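# A sketch of filtering a candidate pool with test_proxy(); the helper name
# and the example proxy addresses are hypothetical placeholders.
def filter_working_proxies(candidates, proxy_type="http"):
    """Return only the proxies that pass test_proxy()."""
    return [ip for ip in candidates if test_proxy(ip, proxy_type)]

# Example:
#     working = filter_working_proxies(["1.2.3.4:8080", "5.6.7.8:3128"])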
def get_html_sp(url):
    """Fetch `url` and return a BeautifulSoup tree, or None on an error status."""
    resp = requests.get(url, headers=get_headers())
    if resp:  # a Response is truthy only for status codes below 400
        return BeautifulSoup(resp.content, "html.parser")
def get_response(url):
    """Thin wrapper around requests.get that applies the shared headers."""
    return requests.get(url, headers=get_headers())
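# A small end-to-end sketch tying the helpers together; the target URL is a
# hypothetical placeholder and the <title> lookup is only an example use of
# the returned soup.
if __name__ == "__main__":
    soup = get_html_sp("https://www.example.com")
    if soup is not None:
        print(soup.title.string if soup.title else "no <title> found")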