def get_fzrb():
    """Download today's PDF pages of 法制日报 (Legal Daily) into a dated folder.

    Saves each layout's PDF under ``papers/<yyyymmdd>/<paper name>/`` next to
    this script, printing progress separators around each download.
    """
    paper_name = '法制日报'
    today_str = datetime.now().strftime('%Y%m%d')
    # papers/<yyyymmdd>/<paper name>/ relative to this script's directory.
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)

    def follow_meta_refresh(base):
        # The index pages redirect via a meta-refresh tag; the target URL is
        # assumed to live in the 4th <meta> element — TODO confirm if the
        # site's markup changes.
        page = getSoup(base)
        target = re.findall('URL=(.*?)"', str(page.select('meta')[3]))[0]
        return urljoin(base, target)

    url = follow_meta_refresh(
        'http://epaper.legaldaily.com.cn/fzrb/content/PaperIndex.htm')
    url_page1 = follow_meta_refresh(url)
    soup = getSoup(url_page1)
    # One table row per layout; the PDF link sits in the row's 3rd cell.
    rows = soup.select(
        'body>table>tr:nth-of-type(2)>td:nth-of-type(1)>table>tr:nth-of-type(1)>td:nth-of-type(2)>table:nth-of-type(3) tr'
    )
    for row in rows:
        print('-' * 50)
        href = row.select('tr td:nth-of-type(3) a')[0].attrs['href']
        # NOTE: links are resolved against the intermediate redirect URL,
        # matching the original behavior.
        pdf_url = urljoin(url, href)
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
def get_kjrb():
    """Download today's PDF pages of 科技日报 (Science and Technology Daily).

    Saves each layout's PDF under ``papers/<yyyymmdd>/<paper name>/`` next to
    this script.
    """
    paper_name = '科技日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    # Follow two consecutive meta-refresh redirects from the site root to
    # today's front page (redirect URL assumed to sit in the 4th <meta> tag).
    index_url = 'http://digitalpaper.stdaily.com/'
    for _ in range(2):
        page = getSoup(index_url)
        index_url = urljoin(
            index_url,
            re.findall('URL=(.*?)"', str(page.select('meta')[3]))[0])
    soup = getSoup(index_url)
    for layout in soup.select('div.bmname a'):
        layout_url = urljoin(index_url, layout.attrs['href'])
        layout_soup = getSoup(layout_url)
        pdf_url = urljoin(layout_url,
                          layout_soup.select('.pdf a')[0].attrs['href'])
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
def get_xxsb():
    """Download today's PDF pages of 学习时报 (Study Times).

    Each layout link on the index page points directly at a PDF. Files are
    saved under ``papers/<yyyymmdd>/<paper name>/`` with a 1-based,
    zero-padded sequence prefix so they sort in page order.
    """
    paper_name = '学习时报'
    today_str = datetime.strftime(datetime.now(), '%Y%m%d')
    paper_dir = os.path.dirname(__file__) + os.sep + \
        'papers' + os.sep + today_str + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://paper.cntheory.com'
    soup = getSoup(url)
    # Follow the meta-refresh redirect to today's index page (the target URL
    # is assumed to sit in the 4th <meta> tag — TODO confirm against site).
    url_1 = urljoin(url,
                    re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url_1)
    layouts = soup.select('.right_title-pdf a')
    # enumerate(start=1) replaces the original index loop that mutated its own
    # loop variable (`i = i + 1`); zfill(2) reproduces the manual '0'+str(i)
    # padding for every positive page number.
    for page_no, layout in enumerate(layouts, start=1):
        pdf_url = urljoin(url_1, layout.attrs['href'])
        print(paper_name + ' - ' + pdf_url)
        downwithbar(pdf_url, paper_dir, str(page_no).zfill(2))
        print('-' * 50)
def get_grrb():
    """Download today's PDF pages of 工人日报 (Workers' Daily) via its JSON API.

    Reads the latest issue date from ``index.json``, then fetches that day's
    page listing and downloads one PDF per page into
    ``papers/<yyyymmdd>/<paper name>/``.
    """
    paper_name = '工人日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    api_root = 'http://i.workercn.cn/paper'
    # index.json reports the latest issue date as 'YYYY-MM-DD'; the random
    # query value presumably defeats intermediate caches.
    index_data = json.loads(
        requests.get(
            f'{api_root}/grrb/index.json?t={random.random()}').text)
    dates = index_data['index'].split('-')
    issue_url = f'{api_root}/grrb/{dates[0]}/{dates[1]}/{dates[2]}.json'
    issue = json.loads(requests.get(issue_url).text)
    for page in range(1, len(issue['pages']) + 1):
        pdf_url = (issue['paperHomeUrl'] + '/' + str(page) + '/grrb'
                   + dates[0] + dates[1] + dates[2] + str(page) + '.pdf')
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
def get_zggfb():
    """Download today's PDF pages of 中国国防报 (China National Defense News).

    Saves each layout's PDF under ``papers/<yyyymmdd>/<paper name>/`` next to
    this script.
    """
    paper_name = '中国国防报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://www.81.cn:80/gfbmap/paperindex.htm'
    # Follow the meta-refresh redirect to today's index page (the target URL
    # is assumed to sit in the 4th <meta> tag).
    soup = getSoup(url)
    url = urljoin(url,
                  re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url)
    for nav_link in soup.select('#APP-SectionNav li a'):
        layout_url = urljoin(url, nav_link.attrs['href'])
        layout_soup = getSoup(layout_url)
        pdf_url = urljoin(layout_url,
                          layout_soup.select('#APP-Pdf')[0].attrs['href'])
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
def get_xhmrdx():
    """Download today's PDF pages of 新华每日电讯 (Xinhua Daily Telegraph).

    Saves each layout's PDF under ``papers/<yyyymmdd>/<paper name>/`` next to
    this script.
    """
    paper_name = '新华每日电讯'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    # Two consecutive meta-refresh hops lead to today's index page (the
    # redirect URL is assumed to sit in the 4th <meta> tag).
    url = 'http://mrdx.cn/content/PaperIndex.htm'
    for _ in range(2):
        soup = getSoup(url)
        url = urljoin(url,
                      re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url)
    for anchor in soup.select('.pdf'):
        pdf_url = urljoin(url, anchor.attrs['href'])
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)
def get_rmrb():
    """Download today's PDF pages of 人民日报 (People's Daily).

    Saves each layout's PDF under ``papers/<yyyymmdd>/<paper name>/`` next to
    this script, pausing one second between layout pages.
    """
    paper_name = '人民日报'
    today_str = datetime.now().strftime('%Y%m%d')
    paper_dir = os.sep.join([os.path.dirname(__file__), 'papers', today_str]) \
        + '/' + paper_name + '/'
    os.makedirs(paper_dir, exist_ok=True)
    url = 'http://paper.people.com.cn/rmrb/paperindex.htm'
    soup = getSoup(url)
    # Follow the meta-refresh redirect to today's index page (the target URL
    # is assumed to sit in the 4th <meta> tag).
    url = urljoin(url,
                  re.findall('URL=(.*?)"', str(soup.select('meta')[3]))[0])
    soup = getSoup(url)
    for page_link in soup.select('#pageLink'):
        time.sleep(1)  # throttle requests between layout pages
        layout_url = urljoin(url, page_link.attrs['href'])
        layout_soup = getSoup(layout_url)
        pdf_url = urljoin(
            layout_url,
            layout_soup.select('.paper-bot .btn a')[0].attrs['href'])
        print(f'{paper_name} - {pdf_url}')
        downwithbar(pdf_url, paper_dir)
        print('-' * 50)