def catch_pic_diagrams(url):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    # First grab the article title and create a folder for it:
    article_header = soup.find('header', attrs={'class': 'article-header'}).find('a').get_text().replace(':', " ")
    save_path = pic_save_path + article_header + "/"
    coderpig.is_dir_existed(save_path)
    print("Start downloading: " + article_header)
    # Grab the image urls
    imgs = soup.find('article').findAll('img')
    for img in imgs[:-1]:  # the last <img> is skipped
        coderpig.download_pic(img['src'].lstrip('/'), save_path)

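# All of the snippets in this section lean on a small "coderpig" helper module
# that is not reproduced here. The sketch below is only a guess at what the most
# commonly used helpers (get_resp / get_bs / is_dir_existed) might look like,
# inferred from how they are called above; names, defaults and behavior are
# assumptions, not the module's actual implementation.
import os
import urllib.request
from bs4 import BeautifulSoup


def get_resp(url, headers=None, proxy=None, read=True):
    # Optionally route the request through a proxy and attach extra headers.
    req = urllib.request.Request(url, headers=headers or {})
    handlers = []
    if proxy is not None:
        handlers.append(urllib.request.ProxyHandler({'http': proxy, 'https': proxy}))
    opener = urllib.request.build_opener(*handlers)
    resp = opener.open(req)
    # Callers above either .decode() the raw bytes or inspect the response object.
    return resp.read() if read else resp


def get_bs(html_text):
    # Wrap the raw html in a BeautifulSoup object for the callers above.
    return BeautifulSoup(html_text, 'html.parser')


def is_dir_existed(path, mkdir=True):
    # Report whether the path exists, creating the directory when asked to.
    if os.path.exists(path):
        return True
    if mkdir:
        os.makedirs(path)
    return False
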
def download_pic(pic_key, pic_dir):
    proxy_ip = coderpig.get_proxy_ip()
    coderpig.is_dir_existed(pic_download_dir)
    url = img_start_url + pic_key + img_end
    # Request through a proxy, sending the Referer header along
    resp = coderpig.get_resp(url, proxy=proxy_ip, headers=referrer_header)
    try:
        print("Downloading picture: " + url)
        pic_name = pic_key + ".jpg"
        with open(pic_dir + pic_name, "wb+") as f:
            f.write(resp)
    except Exception as reason:  # covers OSError, urllib.error.HTTPError and URLError
        print(str(reason))

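# get_proxy_ip() is assumed to hand back one usable proxy address, e.g. picked
# from a previously collected proxy pool; a minimal sketch under that assumption.
# The file path below is hypothetical, and the real helper likely validates the
# proxy before returning it.
import random


def get_proxy_ip(proxy_file='output/proxy_ip.txt'):
    with open(proxy_file, 'r', encoding='utf-8') as f:
        proxies = [line.strip() for line in f if line.strip()]
    return random.choice(proxies)
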
def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        # The real image url sits in the lazy-load attribute "data-original"
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = coderpig.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)

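# Note that coderpig.download_pic(pic_url, pic_path, proxy=...) as called above
# takes a full image url plus a proxy, unlike the standalone download_pic(pic_key,
# pic_dir) defined earlier. A minimal sketch of that variant, assuming the
# get_resp helper sketched near the top of this section:
def download_pic(pic_url, pic_dir, proxy=None):
    pic_name = pic_url.split('/')[-1]
    data = get_resp(pic_url, proxy=proxy)
    with open(pic_dir + pic_name, "wb") as f:
        f.write(data)
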
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Work out the total number of pages from the "末页" (last page) link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab the pictures on this page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)

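# moye_pattern is not shown above. A hypothetical pattern that would satisfy the
# usage: group(1) captures the page count from the "末页" link href, which (as the
# loop's url.replace suggests) ends in "_N.html".
import re

moye_pattern = re.compile(r'.*_(\d+)\.html')
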
# Fetch the picture urls from the page
def fetch_pic():
    browser = coderpig.init_browser()
    for i in range(1, max_page_count + 1):
        url = weibo_url + containerid + "&page=" + str(i)
        browser.get(url)
        print("Parsing ====== page %d ====== " % i)
        html_text = browser.page_source
        soup = coderpig.get_bs(html_text)
        # The rendered page is a JSON payload wrapped in a <pre> tag
        data_json = soup.find('pre').get_text()
        data_dict = json.loads(data_json)
        cards = data_dict['data']['cards']
        for card in cards:
            if 'mblog' in card:
                mblog = card['mblog']
                if 'pics' in mblog:
                    pics = mblog['pics']
                    for pic in pics:
                        if 'large' in pic:
                            pic_url = pic['large']['url']
                            coderpig.download_pic(pic_url, save_path)
    browser.close()


if __name__ == '__main__':
    coderpig.init_https()
    coderpig.is_dir_existed(save_path)
    fetch_pic()

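# init_browser() belongs to the helper module as well; a rough sketch of what it
# might do, assuming Selenium with a headless Chrome. The real helper may use a
# different driver or options.
from selenium import webdriver


def init_browser():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)
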
# Scrape the administrative division codes matching the first six digits of an ID card number
import coderpig
import re

base_url = 'http://www.zxinc.org/gb2260-latest.htm'
file_path = "output/id_card_area_code.txt"
city_pattern = re.compile(r'^(\d{6})\s*(.*)$')


def cat_code_list():
    result_list = []
    soup = coderpig.get_bs(coderpig.get_resp(base_url))
    areacode = soup.find('areacode').get_text()
    city_list = areacode.split("\n")
    # Skip the first two lines, then keep "code:name" pairs
    for i in city_list[2:]:
        result = city_pattern.match(i)
        if result is not None:
            result_list.append(result.group(1) + ":" + result.group(2))
    return result_list


if __name__ == '__main__':
    coderpig.is_dir_existed('output/')
    result_list = cat_code_list()
    if result_list is not None:
        coderpig.write_list_data(result_list, file_path)
        print("File write finished!")

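# write_list_data is another helper that is not shown; a minimal sketch assuming
# it simply writes one entry (here a "code:name" pair) per line:
def write_list_data(data_list, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data_list:
            f.write(item + '\n')
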
def get_article_url(url):
    # Collect the article links on one list page and store them
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, headers=headers, proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'class': 'list_item_new'})
    spans = div.findAll('span', attrs={'class': 'link_title'})
    for span in spans:
        coderpig.write_str_data(base_url + span.find('a')['href'], articles_file)


# Visit the article page
def read_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    resp = coderpig.get_resp(url, read=False, headers=headers, proxy=proxy_ip)
    if (resp is not None) and (resp.getcode() == 200):
        global read_count
        read_count += 1
        print("Total successful visits: %d" % read_count)


if __name__ == '__main__':
    coderpig.init_https()
    if not coderpig.is_dir_existed(articles_file, mkdir=False):
        count = int(get_page_count())
        for i in range(1, count + 1):
            get_article_url(base_article_list + str(i))
    url_list = coderpig.load_data(articles_file)
    while True:
        read_article_url(url_list[random.randint(0, len(url_list) - 1)])

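# write_str_data and load_data are assumed to append one line to a text file and
# to read all lines back into a list; a minimal sketch of both under that
# assumption:
def write_str_data(content, file_path):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(content + '\n')


def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
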
if __name__ == '__main__':
    coderpig.init_https()
    # Pull the board list once if it does not exist yet
    if not os.path.exists(board_ids_file):
        boards_id = catch_all_boards(base_url + user_id)
        while True:
            boards_id = catch_json_boards(boards_json_pattern.sub(str(boards_id), boards_model_url))
            if boards_id is None:
                break
    # Boards rarely change, but the pictures inside them change often,
    # so refresh the pin key file on every run
    if os.path.exists(pin_keys_file):
        os.remove(pin_keys_file)
    boards_list = coderpig.load_data(board_ids_file)
    for board in boards_list:
        pic_save_dir = pic_download_dir + board.split(':')[0] + "/"
        coderpig.is_dir_existed(pic_save_dir)
        board_id = board.split(':')[1]
        board_url = base_url + 'boards/' + board_id + '/'
        board_last_pin_id = get_boards_index_data(board_url, pic_save_dir)
        board_json_url = board_url + pins_model_url
        if board_last_pin_id is not None:
            while True:
                board_last_pin_id = get_json_list(pins_json_pattern.sub(str(board_last_pin_id), board_json_url), pic_save_dir)
                if board_last_pin_id is None:
                    break
    pic_url_list = coderpig.load_data(pin_keys_file)
    for key in pic_url_list:
        # Each entry is stored as "save_dir:pic_key"
        download_pic(key.split(':')[1], key.split(':')[0])