def get_toppost100(params):
    """Fetch one Top-100 ranking page through a proxy and persist image records.

    Retries forever with a fresh proxy on any request/parse error; returns
    None once a page has been processed successfully.

    :param params: query-string dict passed straight to requests.get.
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.get(toppost100_url, params=params,
                                headers=toppost100_headers,
                                proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("抓取:" + resp.request.url)
                soup = cpn.get_bs(resp.text)
                ul = soup.find('ul', attrs={
                    'class': 'l-clearfix gridList workImageCards js-workTopList'})
                for li in ul.findAll('li'):
                    # Drop the last four characters of the src (presumably a
                    # thumbnail-size suffix — TODO confirm against the site).
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img:  # truthiness instead of `not img == ''`
                        name = li.find('p',
                                       attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if not name:
                            # Fall back to a timestamp when the caption is blank.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # Log and retry with another proxy instead of killing the worker thread.
            print(threading.current_thread().name + "~" + str(e))
def get_ajax_data(data):
    """POST the ajax endpoint through a proxy and persist "nameΘimage-url" records.

    Retries forever with a fresh proxy on any request/parse error; returns
    None after one page has been processed successfully.

    :param data: form payload passed straight to requests.post.
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.post(ajax_url, data=data, headers=ajax_headers,
                                 proxies=proxy_ip, timeout=5)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                for li in soup.findAll('li'):
                    # Drop the last four characters of the src (presumably a
                    # thumbnail-size suffix — TODO confirm against the site).
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img:  # truthiness instead of `not img == ''`
                        name = li.find('p',
                                       attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if not name:
                            # Fall back to a timestamp when the caption is blank.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # Log and retry with another proxy instead of killing the worker thread.
            print(threading.current_thread().name + "~" + str(e))
def get_page_count():
    """Return the thread's last page number, parsed from the '尾页' pager link.

    Returns the page number as a string, or None if the request or the
    parsing fails (the error is printed).
    """
    try:
        resp = requests.get(tiezi_url, headers=headers, timeout=5)
        if resp is not None:
            anchors = cpn.get_bs(resp.text).find(
                "ul", attrs={'class': 'l_posts_num'}).findAll("a")
            for anchor in anchors:
                # The "last page" link carries the total page count in its query string.
                if anchor.get_text() == '尾页':
                    return anchor['href'].split('=')[1]
    except Exception as e:
        print(str(e))
def get_page_count():
    """Parse the pager div of the article list and return the total page count.

    Returns the count as a string, or None if the request or parsing fails
    (the error is printed).
    """
    try:
        resp = requests.get(list_url, headers=headers, timeout=5)
        if resp is None:
            return None
        pager = cpn.get_bs(resp.text).find('div', attrs={'id': 'papelist'})
        # The last pager link's href ends with the highest page number.
        page_count = pager.findAll('a')[-1]['href'].split('/')[-1]
        print("解析获得文章页数:" + page_count)
        return page_count
    except Exception as e:
        print(str(e))
def catch_page_count():
    """Fetch the listing through an HTTPS proxy and return the last page number.

    Returns the text of the second-to-last pagination link (the highest page
    number), or None if the request or parsing fails (the error is printed).
    """
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(base_url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            # FIX: attrs was the set {'class', 'pagination'} (comma typo for a
            # colon), which BeautifulSoup silently treats as a class filter; a
            # proper dict matches only class="pagination" as intended.
            pagination = soup.find('div', attrs={'class': 'pagination'})
            # The last <a> is "next"; the one before it holds the last page number.
            return pagination.findAll('a')[-2].get_text()
    except Exception as e:
        print(str(e))
def get_article_url(url):
    """Collect article links from one listing page and append them to articles_file.

    :param url: listing-page URL to parse.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp is None:
            return None
        print("解析:" + resp.request.url)
        soup = cpn.get_bs(resp.text)
        title_spans = soup.find('div', attrs={'id': 'article_list'}).findAll(
            'span', attrs={'class': 'link_title'})
        for title_span in title_spans:
            # Hrefs are site-relative; prefix the site root before persisting.
            cpn.write_str_data(base_url + title_span.find('a')['href'], articles_file)
        return None
    except Exception as e:
        print(str(e))
def catch_page_count():
    """Return the listing's last page number, retrying with fresh proxies.

    Loops until a request succeeds; every failure draws a new proxy and is
    deliberately swallowed (best-effort retry).
    """
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        try:
            resp = requests.get(base_url, headers=headers,
                                proxies=proxy_ip, timeout=5)
            if resp is not None:
                print(proxy_ip)
                soup = cpn.get_bs(resp.text)
                # FIX: attrs was the set {'class', 'pagination'} (comma typo for
                # a colon), which BeautifulSoup silently treats as a class
                # filter; a proper dict matches only class="pagination".
                pagination = soup.find('div', attrs={'class': 'pagination'})
                # The last <a> is "next"; the one before it is the last page number.
                return pagination.findAll('a')[-2].get_text()
        except Exception:
            # Deliberate best-effort: swallow and retry with another proxy.
            pass
def catch_ip(url):
    """Scrape one proxy-list page (via an HTTPS proxy) and persist "ip:port" rows.

    Only rows whose speed-bar title value exceeds 1 are kept; the title's last
    character is stripped before the float conversion (presumably a unit
    suffix — TODO confirm against the site markup).

    :param url: proxy-list page URL to parse.
    """
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            # [1:] skips the table header row.
            for tr in soup.find('table').findAll('tr')[1:]:
                # FIX: attrs was the set {'bar'}, relying on BeautifulSoup's
                # silent non-dict -> class-filter coercion; spell it out.
                if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                    tds = tr.findAll('td')
                    cpn.write_xc_ip_file(tds[1].get_text() + ":" + tds[2].get_text())
    except Exception as e:
        print(str(e))
def catch_ip(url):
    """Scrape one proxy-list page via rotating proxies and persist "ip:port" rows.

    Retries with a fresh proxy until the page is fetched and parsed, then
    returns. Only rows whose speed-bar title value exceeds 1 are kept; the
    title's last character is stripped before the float conversion
    (presumably a unit suffix — TODO confirm against the site markup).

    :param url: proxy-list page URL to parse.
    """
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        print(proxy_ip)
        try:
            resp = requests.get(url, headers=headers,
                                proxies=proxy_ip, timeout=10)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                # [1:] skips the table header row.
                for tr in soup.find('table').findAll('tr')[1:]:
                    # FIX: attrs was the set {'bar'}, relying on BeautifulSoup's
                    # silent non-dict -> class-filter coercion; spell it out.
                    if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                        tds = tr.findAll('td')
                        cpn.write_xc_ip_file(tds[1].get_text() + ":" + tds[2].get_text())
                # BUG FIX: the original had no exit, so it re-crawled the same
                # URL forever and wrote duplicate entries (compare the
                # single-shot variant of this function); stop after one success.
                return None
        except Exception:
            # Deliberate best-effort: swallow and retry with another proxy.
            pass
def get_pics(count):
    """Fetch one ajax page of the thread and record every BDE_Image URL.

    Retries the request until it succeeds, then returns None.

    :param count: page number sent as the 'pn' query parameter.
    """
    while True:
        query = {'pn': count, 'ajax': '1', 't': int(time.time())}
        try:
            resp = requests.get(tiezi_url, headers=headers,
                                timeout=5, params=query)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                for img_tag in soup.findAll('img', attrs={'class': 'BDE_Image'}):
                    cpn.write_str_data(img_tag['src'], pic_urls_file)
                return None
        except Exception:
            # Best-effort: swallow the error and retry the request.
            pass
def get_pics(count):
    """Download one ajax page of the thread and persist all BDE_Image sources.

    Loops until a request completes, then returns None.

    :param count: page number sent as the 'pn' query parameter.
    """
    while True:
        request_params = {
            'pn': count,
            'ajax': '1',
            't': int(time.time()),  # cache-buster timestamp
        }
        try:
            resp = requests.get(tiezi_url, headers=headers,
                                timeout=5, params=request_params)
            if resp is None:
                continue
            images = cpn.get_bs(resp.text).findAll('img',
                                                   attrs={'class': 'BDE_Image'})
            for image in images:
                cpn.write_str_data(image['src'], pic_urls_file)
            return None
        except Exception:
            # Best-effort: ignore the failure and try again.
            pass
def get_ajax_data(data):
    """POST the ajax endpoint through a proxy and persist "nameΘimage-url" records.

    Retries forever with a fresh proxy on any request/parse error; returns
    None after one page has been processed successfully.

    :param data: form payload passed straight to requests.post.
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.post(ajax_url, data=data, headers=ajax_headers,
                                 proxies=proxy_ip, timeout=5)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                for li in soup.findAll('li'):
                    # Drop the last four characters of the src (presumably a
                    # thumbnail-size suffix — TODO confirm against the site).
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img:  # truthiness instead of `not img == ''`
                        name = li.find('p',
                                       attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if not name:
                            # Fall back to a timestamp when the caption is blank.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # Log and retry with another proxy instead of killing the worker thread.
            print(threading.current_thread().name + "~" + str(e))
def get_toppost100(params):
    """Fetch one Top-100 ranking page through a proxy and persist image records.

    Retries forever with a fresh proxy on any request/parse error; returns
    None once a page has been processed successfully.

    :param params: query-string dict passed straight to requests.get.
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.get(toppost100_url, params=params,
                                headers=toppost100_headers,
                                proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("抓取:" + resp.request.url)
                soup = cpn.get_bs(resp.text)
                ul = soup.find('ul', attrs={
                    'class': 'l-clearfix gridList workImageCards js-workTopList'})
                for li in ul.findAll('li'):
                    # Drop the last four characters of the src (presumably a
                    # thumbnail-size suffix — TODO confirm against the site).
                    img = li.find('img', attrs={'class': 'cardImage'})['src'][:-4]
                    if img:  # truthiness instead of `not img == ''`
                        name = li.find('p',
                                       attrs={'class': 'fz14 text cut'}).get_text().strip()
                        if not name:
                            # Fall back to a timestamp when the caption is blank.
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # Log and retry with another proxy instead of killing the worker thread.
            print(threading.current_thread().name + "~" + str(e))