def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = coderpig.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)

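# The snippets in this file all lean on an (unshown) helper module called
# coderpig. Below is a minimal sketch of what those helpers might look like,
# assuming urllib + BeautifulSoup underneath -- the real module may differ
# (get_proxy_ip and init_browser are omitted):
import os
import urllib.request

from bs4 import BeautifulSoup


def get_resp(url, headers=None, proxy=None, read=True):
    # Build a request, optionally routing it through an HTTP proxy.
    req = urllib.request.Request(url, headers=headers or {})
    if proxy:
        req.set_proxy(proxy, 'http')
    resp = urllib.request.urlopen(req)
    return resp.read() if read else resp


def get_bs(markup):
    # Wrap raw HTML (str or bytes) in a BeautifulSoup parser.
    return BeautifulSoup(markup, 'html.parser')


def is_dir_existed(path):
    # Create the directory if it does not exist yet.
    os.makedirs(path, exist_ok=True)


def download_pic(pic_url, save_path, proxy=None):
    # Fetch one picture and save it under save_path, keeping its file name.
    data = get_resp(pic_url, proxy=proxy)
    file_name = os.path.join(save_path, pic_url.split('/')[-1])
    with open(file_name, 'wb') as f:
        f.write(data)


def write_str_data(content, file_path):
    # Append one line of text to a file.
    with open(file_path, 'a+', encoding='utf-8') as f:
        f.write(content + '\n')
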
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Work out the total page count from the "末页" (last page) link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab the pictures on this page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)

def catch_pic_diagrams_url(url):
    url_list = []
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    articles = soup.findAll('article', attrs={'class': 'excerpt'})
    for article in articles:
        url_list.append(article.a['href'])
    return url_list

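# Likely glue between the two functions above -- a sketch only; list_url is
# assumed to be the listing page whose <article> entries link to diagrams:
def crawl_diagrams(list_url):
    for diagram_url in catch_pic_diagrams_url(list_url):
        catch_pic_diagrams(diagram_url)
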
def get_page_count():
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(content_url, headers=headers, proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'id': 'papelist'})
    page_count = (div.findAll('a')[-1]['href']).split('/')[-1]
    return page_count

def get_city_list_url():
    city_list_url = []
    weather_hb_soup = coderpig.get_bs(coderpig.get_resp(weather_hb_url).decode('utf-8'))
    weather_box = weather_hb_soup.find(attrs={'class': 'lqcontentBoxheader'})
    weather_a_list = weather_box.findAll('a')
    for i in weather_a_list:
        city_list_url.append(weather_base_url + i['href'])
    return city_list_url

def catch_pic_diagrams_url(url):
    url_list = []
    soup = coderpig.get_bs(coderpig.get_resp(url))
    div = soup.find('div', attrs={'class': 'taotu-main'})
    lis = div.findAll('li')
    for li in lis:
        # Skip the "longword" text entries; keep only the picture items
        if 'longword' not in (li.get('class') or []):
            url_list.append(base_url + li.find('a')['href'])
    return url_list

def get_pic_set_page(url):
    url_list = []
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    div = soup.find('div', attrs={'class': 'pages'})
    a_s = div.findAll('a', attrs={'class': 'num'})
    for a in a_s:
        url_list.append(a['href'])
    return url_list

def get_pic_set(url):
    url_list = []
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    divs = soup.findAll('div', attrs={'class': 'tab_tj'})
    a_s = divs[1].findAll('a')
    for a in a_s:
        url_list.append(a['href'])
    return url_list

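# A guess at how the two helpers above chain together (sketch; start_url and
# the per-page handler are assumptions): walk every picture set, then every
# pagination page inside it.
def crawl_all_pic_sets(start_url):
    for set_url in get_pic_set(start_url):
        for page_url in get_pic_set_page(set_url):
            print(page_url)  # replace with the actual per-page download step
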
def get_tag_url():
    print("================================================== Checking for valid tag pages:\n")
    for i in range(2, 101):
        proxy_ip = coderpig.get_proxy_ip()
        tag_url = host_url + '/meinvtag' + str(i) + '_1.html'
        resp = coderpig.get_resp(tag_url, proxy=proxy_ip, read=False)
        if resp is not None and resp.getcode() == 200:
            soup = coderpig.get_bs(resp.read())
            coderpig.write_str_data(soup.find('h2').get_text() + "-" + tag_url, tag_url_file)

def cat_code_list():
    result_list = []
    soup = coderpig.get_bs(coderpig.get_resp(base_url))
    areacode = soup.find('areacode').get_text()
    city_list = areacode.split("\n")
    for i in city_list[2:]:
        result = city_pattern.match(i)
        if result is not None:
            result_list.append(result.group(1) + ":" + result.group(2))
    return result_list

def get_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, headers=headers, proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'class': 'list_item_new'})
    spans = div.findAll('span', attrs={'class': 'link_title'})
    for span in spans:
        coderpig.write_str_data(base_url + span.find('a')['href'], articles_file)

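# Possible driver for the two CSDN helpers (get_page_count above and
# get_article_url here) -- a sketch only; the "/article/list/N" URL shape is
# an assumption, not confirmed by the snippets:
def collect_all_articles():
    page_count = int(get_page_count())
    for page in range(1, page_count + 1):
        get_article_url(content_url + '/article/list/' + str(page))
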
def catch_pic_diagrams(url):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    # Build the save folder from the article title first (':' is illegal in paths)
    article_header = soup.find('header', attrs={'class': 'article-header'}).find('a').get_text().replace(':', " ")
    save_path = pic_save_path + article_header + "/"
    coderpig.is_dir_existed(save_path)
    print("Start downloading: " + article_header)
    # Grab the picture URLs (the last <img> is skipped)
    imgs = soup.find('article').findAll('img')
    for img in imgs[:-1]:
        coderpig.download_pic(img['src'].lstrip('/'), save_path)

def get_city_code(city_list_url):
    city_code_dict = {}  # empty dict: city code -> city name
    city_pattern = re.compile(r'^<a.*?weather/(.*?).s.*</a>$')  # regex that pulls the city code out of the link
    weather_hb_soup = coderpig.get_bs(coderpig.get_resp(city_list_url).decode('utf-8'))
    # Filter out the invalid tables first
    div_conMidtab = weather_hb_soup.find_all(attrs={'class': 'conMidtab', 'style': ''})
    for mid in div_conMidtab:
        tab3 = mid.find_all(attrs={'class': 'conMidtab3'})
        for tab in tab3:
            trs = tab.findAll('tr')
            for tr in trs:
                a_list = tr.findAll('a')
                for a in a_list:
                    if a.get_text() != "详情":  # skip the "详情" (details) links
                        # Pull the city code out with the regex
                        city_code = city_pattern.match(str(a)).group(1)
                        city_name = a.string
                        city_code_dict[city_code] = city_name
    return city_code_dict

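# Putting the two weather helpers together -- a minimal sketch assuming the
# weather_hb_url / weather_base_url globals used above are defined:
def build_city_code_dict():
    all_codes = {}
    for list_url in get_city_list_url():
        all_codes.update(get_city_code(list_url))
    return all_codes
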
def fetch_pic():
    browser = coderpig.init_browser()
    for i in range(1, max_page_count + 1):
        url = weibo_url + containerid + "&page=" + str(i)
        browser.get(url)
        print("Start parsing ====== page %d ====== " % i)
        html_text = browser.page_source
        soup = coderpig.get_bs(html_text)
        data_json = soup.find('pre').get_text()
        data_dict = json.loads(data_json)
        cards = data_dict['data']['cards']
        for card in cards:
            if 'mblog' in card:
                mblog = card['mblog']
                if 'pics' in mblog:
                    for pic in mblog['pics']:
                        if 'large' in pic:
                            pic_url = pic['large']['url']
                            coderpig.download_pic(pic_url, save_path)
    browser.close()

def get_meizi_url(html):
    soup = coderpig.get_bs(html)
    ol = soup.find('ol', attrs={'class': 'commentlist'})
    href = ol.findAll('a', attrs={'class': 'view_img_link'})
    for a in href:
        download_pic(a['href'])

def get_page_count(html):
    soup = coderpig.get_bs(html)
    page_count = soup.find('span', attrs={'class': 'current-comment-page'})
    # The span text looks like "[123]": strip the brackets, then step back one page
    return int(page_count.get_text()[1:-1]) - 1

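# Example driver for the two comment-page helpers above -- a sketch only:
# fetch the newest page to learn the page count, then walk the pages
# backwards. The comments_url name and the "/page-N" URL scheme are
# assumptions here, not confirmed by the snippets.
def crawl_comment_pages(comments_url):
    first_html = coderpig.get_resp(comments_url).decode('utf-8')
    page_count = get_page_count(first_html)
    for page in range(page_count, 0, -1):
        page_html = coderpig.get_resp(comments_url + '/page-' + str(page)).decode('utf-8')
        get_meizi_url(page_html)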