import requests
import threading
import time

import cpn  # the author's helper module: proxy pool, BeautifulSoup wrapper and file helpers


def download_pic(img):
    # Each record is "nameΘurl"; split it back into a file name and an image URL.
    img_url = img.split('Θ')[-1]
    pic_name = img.split('Θ')[0] + '.' + img_url.split('.')[-1]
    while True:
        proxy_ip = {
            'http': 'http://' + cpn.get_dx_proxy_ip(),
            'https': 'https://' + cpn.get_dx_proxy_ip()
        }
        try:
            resp = requests.get(img_url, headers=pic_headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading: " + resp.request.url)
                with open(pic_save_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception:
            # The proxy failed or timed out; loop again with a fresh proxy.
            pass
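# The collectors below write one "nameΘurl" record per line to pic_urls_file. A minimal
# driver sketch for feeding those records to download_pic concurrently; the function name,
# pool size and encoding are assumptions for illustration, not part of the original code.
from concurrent.futures import ThreadPoolExecutor

def download_all_pics():
    # Read every collected "nameΘurl" record and download the images in parallel.
    with open(pic_urls_file, 'r', encoding='utf-8') as f:
        records = [line.strip() for line in f if line.strip()]
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(download_pic, records)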
def get_ajax_data(data):
    while True:
        proxy_ip = {
            'http': 'http://' + cpn.get_dx_proxy_ip(),
            'https': 'https://' + cpn.get_dx_proxy_ip()
        }
        try:
            resp = requests.post(ajax_url, data=data, headers=ajax_headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                lis = soup.findAll('li')
                for li in lis:
                    # Drop the trailing 4-character size suffix from the thumbnail URL.
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img != '':
                        name = li.find('p', attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if name == '':
                            # Fall back to a timestamp when the work has no title.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            print(threading.current_thread().name + "~" + str(e))
def catch_page_count():
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(base_url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            # Read the last page number from the pagination bar.
            last_page_count = soup.find('div', attrs={'class': 'pagination'}).findAll('a')[-2].get_text()
            return last_page_count
    except Exception as e:
        print(str(e))
def catch_page_count():
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        try:
            resp = requests.get(base_url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print(proxy_ip)
                soup = cpn.get_bs(resp.text)
                # Read the last page number from the pagination bar.
                last_page_count = soup.find('div', attrs={'class': 'pagination'}).findAll('a')[-2].get_text()
                return last_page_count
        except Exception:
            # The proxy failed or timed out; retry with a new one.
            pass
def get_toppost100(params):
    while True:
        proxy_ip = {
            'http': 'http://' + cpn.get_dx_proxy_ip(),
            'https': 'https://' + cpn.get_dx_proxy_ip()
        }
        try:
            resp = requests.get(toppost100_url, params=params, headers=toppost100_headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Fetching: " + resp.request.url)
                soup = cpn.get_bs(resp.text)
                ul = soup.find('ul', attrs={'class': 'l-clearfix gridList workImageCards js-workTopList'})
                lis = ul.findAll('li')
                for li in lis:
                    # Drop the trailing 4-character size suffix from the thumbnail URL.
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img != '':
                        name = li.find('p', attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if name == '':
                            # Fall back to a timestamp when the work has no title.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            print(threading.current_thread().name + "~" + str(e))
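# One hypothetical way to drive get_toppost100 over every ranking page, reusing
# catch_page_count() above for the page total. The 'p' query-parameter name and the
# worker count are assumptions for illustration only, not taken from the original code.
from concurrent.futures import ThreadPoolExecutor

def crawl_toppost100():
    last_page = int(catch_page_count())
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(get_toppost100, [{'p': page} for page in range(1, last_page + 1)])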
def catch_ip(url):
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            # Skip the table header row, then read ip and port from each remaining row.
            trs = soup.find('table').findAll('tr')[1:]
            for tr in trs:
                if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                    tds = tr.findAll('td')
                    cpn.write_xc_ip_file(tds[1].get_text() + ":" + tds[2].get_text())
    except Exception as e:
        print(str(e))
def catch_ip(url):
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        print(proxy_ip)
        try:
            resp = requests.get(url, headers=headers, proxies=proxy_ip, timeout=10)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                # Skip the table header row, then read ip and port from each remaining row.
                trs = soup.find('table').findAll('tr')[1:]
                for tr in trs:
                    if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                        tds = tr.findAll('td')
                        cpn.write_xc_ip_file(tds[1].get_text() + ":" + tds[2].get_text())
                return None
        except Exception:
            # The proxy failed or timed out; retry with a new one.
            pass
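# The cpn helper module used throughout is not shown in this snippet. A rough sketch of
# the assumed interface, with placeholder file names and a simple file-backed proxy pool
# standing in for the real proxy source:
import random
from bs4 import BeautifulSoup

def get_bs(markup):
    # Wrap raw HTML in a BeautifulSoup document.
    return BeautifulSoup(markup, 'lxml')

def get_dx_proxy_ip():
    # Return one "ip:port" string picked at random from a local proxy-pool file.
    with open('proxy_pool.txt', 'r', encoding='utf-8') as f:
        return random.choice([line.strip() for line in f if line.strip()])

def write_str_data(content, file_path):
    # Append one record per line to the given file.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def write_xc_ip_file(ip):
    # Persist one proxy harvested by catch_ip().
    write_str_data(ip, 'xc_proxy_ip.txt')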