Code example #1
def catch_pic_diagrams(url):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    # Use the article title as the folder name (':' is not safe in
    # directory names, so swap it for a space):
    article_header = soup.find('header', attrs={'class': 'article-header'}).find('a').get_text().replace(':', " ")
    save_path = pic_save_path + article_header + "/"
    coderpig.is_dir_existed(save_path)
    print("Start downloading: " + article_header)
    # Collect the image URLs; the final <img> is skipped (not an article picture)
    imgs = soup.find('article').findAll('img')
    for img in imgs[:-1]:
        coderpig.download_pic(img['src'].lstrip('/'), save_path)
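
Every example below calls into a shared helper module, coderpig, whose source is not included here. The sketch below reconstructs those helpers purely from how they are called in these snippets; the names and signatures match the call sites, but every body is an assumption, not the original implementation.

# coderpig.py -- minimal sketch of the helper module, reconstructed only
# from the call sites in the examples; every body here is an assumption.
import os
import ssl
import urllib.request

from bs4 import BeautifulSoup


def init_https():
    # Assumed: disable HTTPS certificate verification for the crawler
    ssl._create_default_https_context = ssl._create_unverified_context


def get_resp(url, headers=None, proxy=None, read=True):
    # Assumed: fetch a URL (optionally through a proxy) and return the raw
    # bytes, or the open response object when read=False
    request = urllib.request.Request(url, headers=headers or {})
    if proxy:
        request.set_proxy(proxy, 'http')
    resp = urllib.request.urlopen(request)
    return resp.read() if read else resp


def get_bs(markup):
    # Assumed: parse markup into a BeautifulSoup tree
    return BeautifulSoup(markup, 'html.parser')


def get_proxy_ip():
    # Assumed: pick an address from a proxy pool; stubbed out here
    return None


def init_browser():
    # Assumed: start a Selenium session for pages that need JavaScript
    from selenium import webdriver
    return webdriver.Chrome()


def is_dir_existed(path, mkdir=True):
    # Assumed: report whether path exists, creating it when mkdir is True
    existed = os.path.exists(path)
    if not existed and mkdir:
        os.makedirs(path)
    return existed


def download_pic(url, save_dir, proxy=None):
    # Assumed: fetch one picture and save it under its own file name
    data = get_resp(url, proxy=proxy)
    with open(os.path.join(save_dir, url.split('/')[-1]), 'wb') as f:
        f.write(data)


def write_str_data(line, file_path):
    # Assumed: append a single line of text to a file
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def write_list_data(data_list, file_path):
    # Assumed: write a list of strings to a file, one per line
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(data_list))


def load_data(file_path):
    # Assumed: read a file back into a list of stripped lines
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]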
Code example #2
def download_pic(pic_key, pic_dir):
    proxy_ip = coderpig.get_proxy_ip()
    coderpig.is_dir_existed(pic_download_dir)
    url = img_start_url + pic_key + img_end
    try:
        # The request sits inside the try so HTTP/URL errors are actually caught
        resp = coderpig.get_resp(url, proxy=proxy_ip, headers=referrer_header)
        print("Downloading picture: " + url)
        pic_name = pic_key + ".jpg"
        with open(pic_dir + pic_name, "wb") as f:
            f.write(resp)
    except (urllib.error.HTTPError, urllib.error.URLError, OSError) as reason:
        print(str(reason))
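
This download_pic also leans on module-level values the snippet does not define. The placeholders below are purely hypothetical, shown only to make the URL construction and headers concrete; the real values are specific to the site being crawled:

import urllib.error  # needed for the except clause above

# Hypothetical placeholders; the real values live elsewhere in the script:
img_start_url = 'https://example.com/pics/'            # image CDN prefix
img_end = '_fw658'                                     # size/format suffix
referrer_header = {'Referer': 'https://example.com/'}  # anti-hotlink header
pic_download_dir = 'output/pics/'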
Code example #3
def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        # Each <li> links to a detail page; the real image URL sits in the
        # 'data-original' attribute (lazy loading), not in 'src'
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = coderpig.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)
Code example #4
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    # Strip the site suffix from the <title> to get the directory name
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Read the total page count from the '末页' ("last page") link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab the pictures on this page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)
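
moye_pattern is defined elsewhere in the original script. Judging from its use above (extracting the page number from the '末页' link, whose href presumably ends in '_<n>.html' like the paginated URLs built in the loop), it would be something along these lines; the exact pattern is an assumption:

import re

# Hypothetical: captures the trailing page number in hrefs such as
# "xxx_12.html"; the real pattern is defined elsewhere in the script.
moye_pattern = re.compile(r'.*_(\d+)\.html')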
Code example #5
# Collect the picture URLs from each page
def fetch_pic():
    browser = coderpig.init_browser()
    for i in range(1, max_page_count + 1):
        url = weibo_url + containerid + "&page=" + str(i)
        browser.get(url)
        print("Parsing ====== page %d ====== " % i)
        # The endpoint returns raw JSON, which the browser renders in a <pre> tag
        html_text = browser.page_source
        soup = coderpig.get_bs(html_text)
        data_json = soup.find('pre').get_text()
        data_dict = json.loads(data_json)
        cards = data_dict['data']['cards']
        for card in cards:
            if 'mblog' in card:
                mblog = card['mblog']
                if 'pics' in mblog:
                    for pic in mblog['pics']:
                        if 'large' in pic:
                            pic_url = pic['large']['url']
                            coderpig.download_pic(pic_url, save_path)
    browser.close()


if __name__ == '__main__':
    coderpig.init_https()
    coderpig.is_dir_existed(save_path)
    fetch_pic()
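
For orientation, the nested key lookups in fetch_pic() imply that each page of the API response is shaped roughly like the structure below; this is inferred from the parsing code, not taken from official documentation:

# Shape assumed by fetch_pic(), inferred from the key lookups above:
sample_response = {
    "data": {
        "cards": [
            {"mblog": {"pics": [
                {"large": {"url": "https://example.com/a.jpg"}}
            ]}}
        ]
    }
}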
Code example #6
# Scrape the administrative division codes that form the first six
# digits of a Chinese ID card number

import coderpig
import re

base_url = 'http://www.zxinc.org/gb2260-latest.htm'
file_path = "output/id_card_area_code.txt"
city_pattern = re.compile(r'^(\d{6})\s*(.*)$')


def cat_code_list():
    result_list = []
    soup = coderpig.get_bs(coderpig.get_resp(base_url))
    areacode = soup.find('areacode').get_text()
    city_list = areacode.split("\n")
    # Skip the first two lines (non-data header lines)
    for i in city_list[2:]:
        result = city_pattern.match(i)
        if result is not None:
            result_list.append(result.group(1) + ":" + result.group(2))
    return result_list


if __name__ == '__main__':
    coderpig.is_dir_existed('output/')
    result_list = cat_code_list()
    if result_list:
        coderpig.write_list_data(result_list, file_path)
    print("File written!")
Code example #7
# Collect article URLs from one list page (the snippet begins inside this
# function; the name is inferred from the call in __main__ below)
def get_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(
        coderpig.get_resp(url, headers=headers,
                          proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'class': 'list_item_new'})
    spans = div.findAll('span', attrs={'class': 'link_title'})
    for span in spans:
        coderpig.write_str_data(base_url + span.find('a')['href'],
                                articles_file)


# Visit an article page and count only responses that come back with HTTP 200
def read_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    resp = coderpig.get_resp(url, read=False, headers=headers, proxy=proxy_ip)
    if (resp is not None) and (resp.getcode() == 200):
        global read_count
        read_count += 1
        print("Total successful visits: %d" % read_count)


if __name__ == '__main__':
    coderpig.init_https()
    if not coderpig.is_dir_existed(articles_file, mkdir=False):
        count = int(get_page_count())
        for i in range(1, count + 1):
            get_article_url(base_article_list + str(i))
    url_list = coderpig.load_data(articles_file)
    # Endless loop: visit a random cached article URL on each iteration
    while True:
        read_article_url(random.choice(url_list))
Code example #8
if __name__ == '__main__':
    coderpig.init_https()
    # If the board-id cache file does not exist yet, fetch it once
    if not os.path.exists(board_ids_file):
        boards_id = catch_all_boards(base_url + user_id)
        while True:
            boards_id = catch_json_boards(boards_json_pattern.sub(str(boards_id), boards_model_url))
            if boards_id is None:
                break
    # Boards rarely change, but the pictures inside them change often,
    # so the pin-key cache is rebuilt on every run
    if os.path.exists(pin_keys_file):
        os.remove(pin_keys_file)
    boards_list = coderpig.load_data(board_ids_file)
    for board in boards_list:
        pic_save_dir = pic_download_dir + board.split(':')[0] + "/"
        coderpig.is_dir_existed(pic_save_dir)
        board_id = board.split(':')[1]
        board_url = base_url + 'boards/' + board_id + '/'
        board_last_pin_id = get_boards_index_data(board_url, pic_save_dir)
        board_json_url = board_url + pins_model_url
        if board_last_pin_id is not None:
            while True:
                # Page through the board JSON until no more pins come back
                board_last_pin_id = get_json_list(pins_json_pattern.sub(str(board_last_pin_id), board_json_url),
                                                  pic_save_dir)
                if board_last_pin_id is None:
                    break
    pic_url_list = coderpig.load_data(pin_keys_file)
    for key in pic_url_list:
        download_pic(key.split(':')[1], key.split(':')[0])
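
The regex substitution above is a cursor-style pagination trick: pins_model_url is assumed to be a URL template containing a placeholder that pins_json_pattern swaps for the id of the last pin seen, yielding the URL of the next page of JSON. A tiny illustration with a made-up template and ids (the real templates are defined elsewhere in the script):

import re

# Hypothetical template and pattern, only to show the substitution step:
pins_model_url = 'pins/?max=LAST_PIN&limit=20'
pins_json_pattern = re.compile(r'LAST_PIN')

board_json_url = 'https://example.com/boards/123/' + pins_model_url
print(pins_json_pattern.sub('456789', board_json_url))
# -> https://example.com/boards/123/pins/?max=456789&limit=20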