def catch_pic_diagrams(url):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    # Use the article title as the folder name (replace ':' so the path is valid):
    article_header = soup.find('header', attrs={'class': 'article-header'}).find('a').get_text().replace(':', " ")
    save_path = pic_save_path + article_header + "/"
    coderpig.is_dir_existed(save_path)
    print("Start downloading: " + article_header)
    # Collect the image URLs in the article body, skipping the last <img>
    imgs = soup.find('article').findAll('img')
    for img in imgs[:-1]:
        coderpig.download_pic(img['src'].lstrip('/'), save_path)
def fetch_json(url):
    data = json.loads(coderpig.get_resp(url).decode('utf-8'))
    result_list = data['postList']
    for result in result_list:
        # One sub-folder per post, named after its post_id
        save_path = pic_save_path + result['post_id'] + '/'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # Resolve the picture URLs on the post page, then download each one
        pic_list = get_pic_url_list(result['url'])
        for pic in pic_list:
            coderpig.download_pic(pic, save_path)
def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    # Each <li> in the thumbnail list links to a detail page holding the full-size picture
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        # The real image URL sits in the lazy-load attribute data-original
        pic_url = pic_div.find('img')['data-original']
        # Fetch a proxy IP for the download to avoid getting blocked
        proxy_ip = t.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    # Use the <title> text (minus its trailing site suffix) as the folder name
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Read the total page count from the "末页" (last page) link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab every picture on the current page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)
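# The function above relies on a module-level regex, moye_pattern, that is not
# shown here. A minimal sketch, assuming the "末页" (last page) link ends in
# "_<N>.html" where N is the total page count (mirroring the "_<page>.html"
# URLs the loop itself builds); the exact pattern is an assumption, not taken
# from the source:
import re

moye_pattern = re.compile(r'.*_(\d+)\.html')  # group(1) -> total page count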
def fetch_pic():
    browser = coderpig.init_browser()
    for i in range(1, max_page_count + 1):
        url = weibo_url + containerid + "&page=" + str(i)
        browser.get(url)
        print("Parsing ====== page %d ======" % i)
        html_text = browser.page_source
        # The API response is plain JSON, which the browser renders inside a <pre> tag
        soup = coderpig.get_bs(html_text)
        data_json = soup.find('pre').get_text()
        data_dict = json.loads(data_json)
        cards = data_dict['data']['cards']
        for card in cards:
            if 'mblog' in card:
                mblog = card['mblog']
                if 'pics' in mblog:
                    pics = mblog['pics']
                    for pic in pics:
                        if 'large' in pic:
                            # Download the large version of each attached picture
                            pic_url = pic['large']['url']
                            coderpig.download_pic(pic_url, save_path)
    browser.close()
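# fetch_pic() above expects some module-level configuration that is not shown.
# A minimal sketch, assuming the mobile Weibo container API
# (m.weibo.cn/api/container/getIndex) is being queried; the containerid and
# page count below are placeholders, not values from the source:
weibo_url = 'https://m.weibo.cn/api/container/getIndex?containerid='
containerid = 'xxxxxxxxxx'   # hypothetical: the target user's photo-feed container id
max_page_count = 10          # hypothetical: number of result pages to walk
save_path = pic_save_path + 'weibo/'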
def fetch_meizi_pic(url):
    data = json.loads(coderpig.get_resp(url).decode('utf-8'))
    result_list = data['results']
    # Each result carries a direct picture URL, so it can be downloaded straight away
    for result in result_list:
        coderpig.download_pic(result['url'], pic_save_path)
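# A minimal driver sketch for fetch_meizi_pic(), assuming the JSON API is
# paginated by appending a page number to the URL; the base URL below is a
# placeholder, not taken from the source:
api_base = 'https://example.com/api/pics/20/'  # hypothetical paginated endpoint
for page in range(1, 11):
    fetch_meizi_pic(api_base + str(page))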