Code Example #1
File: spiders.py Project: captainmoore/getpaper
def get_fzrb():
    paper_name = '法制日报'
    today_str = datetime.now().strftime('%Y%m%d')
    # Save PDFs under papers/<date>/<paper name>/ next to this file
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://epaper.legaldaily.com.cn/fzrb/content/PaperIndex.htm'
    soup = getSoup(url)

    # Follow two <meta http-equiv="refresh"> redirects to reach page 1
    real_url = re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0]
    url = urljoin(url, real_url)
    soup = getSoup(url)
    url_page1 = urljoin(
        url,
        re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url_page1)

    # Table rows containing each page's (layout's) PDF link
    layout_pdf_trs = soup.select(
        'body>table>tr:nth-of-type(2)>td:nth-of-type(1)>table>tr:nth-of-type(1)>td:nth-of-type(2)>table:nth-of-type(3) tr'
    )

    for tr in layout_pdf_trs:
        print('-' * 50)
        href = tr.select('tr td:nth-of-type(3) a')[0].attrs['href']
        pdf_url = urljoin(url, href)
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
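All seven spiders on this page call two helpers, getSoup and downwithbar, that are defined elsewhere in spiders.py and are not shown here, and they rely on module-level imports from that file. The sketch below is a guess at what those pieces look like, inferred only from the call sites above; the tqdm progress bar and the optional filename-prefix parameter (used by get_xxsb in Example #3) are assumptions, not the project's actual code.

# Module-level imports assumed by every snippet on this page
import json
import os
import random
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # assumed progress-bar dependency


def getSoup(url):
    # Fetch a page and parse it; encodings vary across these sites,
    # so let requests guess from the response body
    resp = requests.get(url, timeout=30)
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def downwithbar(pdf_url, paper_dir, prefix=''):
    # Stream a PDF into paper_dir with a progress bar; prefix is an
    # assumed optional argument that forces a sortable filename
    filename = prefix + os.path.basename(pdf_url)
    resp = requests.get(pdf_url, stream=True, timeout=30)
    total = int(resp.headers.get('content-length', 0))
    with open(os.path.join(paper_dir, filename), 'wb') as f, \
            tqdm(total=total, unit='B', unit_scale=True) as bar:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
            bar.update(len(chunk))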
Code Example #2
File: spiders.py Project: captainmoore/getpaper
def get_kjrb():
    paper_name = '科技日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://digitalpaper.stdaily.com/'

    soup = getSoup(url)
    # Follow two <meta http-equiv="refresh"> redirects to the current issue
    url_1 = urljoin(url,
                    re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url_1)
    url_2 = urljoin(url_1,
                    re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url_2)

    layouts = soup.select('div.bmname a')  # one link per page (layout)

    for layout in layouts:
        layout_url = urljoin(url_2, layout.attrs['href'])
        layout_soup = getSoup(layout_url)
        href = layout_soup.select('.pdf a')[0].attrs['href']
        pdf_url = urljoin(layout_url, href)
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
Code Example #3
File: spiders.py Project: captainmoore/getpaper
def get_xxsb():
    paper_name = '学习时报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://paper.cntheory.com'

    soup = getSoup(url)
    # Follow the <meta http-equiv="refresh"> redirect to today's index
    url_1 = urljoin(url,
                    re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url_1)
    layouts = soup.select('.right_title-pdf a')

    for i, layout in enumerate(layouts, start=1):
        pdf_url = urljoin(url_1, layout.attrs['href'])
        print(paper_name + ' - ' + pdf_url)
        # Zero-pad the page number so the files sort correctly (01, 02, ...)
        downwithbar(pdf_url, paper_dir, '{:02d}'.format(i))
        print('-' * 50)
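Every HTML-based spider here repeats the same dance: the index URL serves a <meta http-equiv="refresh"> stub, and the real page is recovered by running re.findall('URL=(.*?)"', ...) against the fourth <meta> tag. Indexing the tag by position breaks as soon as the page gains or loses a tag; a hypothetical helper (not part of getpaper) could select the refresh tag by attribute instead:

import re
from urllib.parse import urljoin


def follow_meta_refresh(url, soup):
    # Hypothetical replacement for the soup.select('meta')[3] pattern:
    # find the refresh tag by its http-equiv attribute and resolve the
    # URL= target against the current page
    for meta in soup.find_all('meta'):
        if meta.get('http-equiv', '').lower() == 'refresh':
            match = re.search(r'URL=(\S+)', meta.get('content', ''),
                              re.IGNORECASE)
            if match:
                return urljoin(url, match.group(1))
    return None

With such a helper, each redirect hop above would collapse to url = follow_meta_refresh(url, soup).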
Code Example #4
File: spiders.py Project: captainmoore/getpaper
def get_grrb():
    paper_name = '工人日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)

    paper_api_home = 'http://i.workercn.cn/paper'
    # index.json returns the date of the latest issue; the random query
    # string is a cache buster
    paper_date = json.loads(
        requests.get(paper_api_home + '/grrb/index.json?t=' +
                     str(random.random())).text)['index']
    dates = paper_date.split('-')
    paper_url = paper_api_home + '/grrb/' + \
        dates[0] + '/' + dates[1] + '/' + dates[2] + '.json'
    paper_data = json.loads(requests.get(paper_url).text)

    # One PDF per page, numbered from 1
    for i in range(1, len(paper_data['pages']) + 1):
        pdf_url = paper_data['paperHomeUrl'] + '/' + \
            str(i) + '/grrb' + dates[0] + dates[1] + dates[2] + str(i) + '.pdf'
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
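get_grrb is the odd one out: it reads a JSON API rather than scraping HTML. The shape of the two responses can be inferred from the fields the code touches; the values below are illustrative guesses, and only the key names ('index', 'pages', 'paperHomeUrl') are confirmed by the code:

# GET http://i.workercn.cn/paper/grrb/index.json
#   -> {"index": "2021-06-01", ...}        # date of the latest issue
# GET http://i.workercn.cn/paper/grrb/2021/06/01.json
#   -> {"paperHomeUrl": "http://...",      # base URL for this issue's PDFs
#       "pages": [...], ...}               # one entry per page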
Code Example #5
File: spiders.py Project: captainmoore/getpaper
def get_zggfb():
    paper_name = '中国国防报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://www.81.cn:80/gfbmap/paperindex.htm'

    soup = getSoup(url)
    # Follow the <meta http-equiv="refresh"> redirect to today's index
    real_url = re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0]
    url = urljoin(url, real_url)
    soup = getSoup(url)

    layouts = soup.select('#APP-SectionNav li a')
    for layout in layouts:
        layout_url = urljoin(url, layout.attrs['href'])
        layout_soup = getSoup(layout_url)
        pdf_url_temp = layout_soup.select('#APP-Pdf')[0].attrs['href']
        pdf_url = urljoin(layout_url, pdf_url_temp)
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
Code Example #6
File: spiders.py Project: captainmoore/getpaper
def get_xhmrdx():
    paper_name = '新华每日电讯'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)

    url = 'http://mrdx.cn/content/PaperIndex.htm'
    soup = getSoup(url)
    # Follow two <meta http-equiv="refresh"> redirects to today's index
    real_url = re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0]
    url = urljoin(url, real_url)
    soup = getSoup(url)
    real_url_2 = re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0]
    url = urljoin(url, real_url_2)
    soup = getSoup(url)

    layouts = soup.select('.pdf')
    for layout in layouts:
        pdf_url = urljoin(url, layout.attrs['href'])
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
Code Example #7
File: spiders.py Project: captainmoore/getpaper
def get_rmrb():
    paper_name = '人民日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.path.join(
        os.path.dirname(__file__), 'papers', today_str, paper_name, '')
    os.makedirs(paper_dir, exist_ok=True)

    url = 'http://paper.people.com.cn/rmrb/paperindex.htm'
    soup = getSoup(url)
    # Follow the <meta http-equiv="refresh"> redirect to today's index
    real_url = re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0]
    url = urljoin(url, real_url)
    soup = getSoup(url)

    layouts = soup.select('#pageLink')
    for layout in layouts:
        time.sleep(1)  # throttle requests between pages
        layout_url = urljoin(url, layout.attrs['href'])
        layout_soup = getSoup(layout_url)

        pdf_url_temp = layout_soup.select('.paper-bot .btn a')[0].attrs['href']
        pdf_url = urljoin(layout_url, pdf_url_temp)
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
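Since every spider is self-contained, a small driver is enough to fetch all seven papers in one run. A minimal sketch, not part of the original project; the try/except keeps one unreachable site from aborting the rest:

def main():
    # Run every spider in turn, reporting failures without stopping
    for spider in (get_fzrb, get_kjrb, get_xxsb, get_grrb,
                   get_zggfb, get_xhmrdx, get_rmrb):
        try:
            spider()
        except Exception as exc:
            print(spider.__name__ + ' failed: ' + str(exc))


if __name__ == '__main__':
    main()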