def main():
    """Crawl three Hebei MPA (yjj.hebei.gov.cn) channel listings and parse them.

    Relies on module-level helpers `get` (fetch + decode) and `parse_index`.
    """
    # Channel CL0214, pages 32-34 (the i == 0 branch is kept for symmetry with
    # the other channels, although this range never reaches it).
    for i in range(32, 35):
        print(i)
        if i == 0:
            url = "http://yjj.hebei.gov.cn/CL0214/index.html"
        else:
            url = f"http://yjj.hebei.gov.cn/CL0214/index_{i}.html"
        try:
            html = get(url, code="gb2312")
        except Exception:  # was a bare except; narrowed so KeyboardInterrupt escapes
            # GB18030 is a superset of gb2312; retry pages that fail to decode.
            html = get(url, code="GB18030")
        parse_index(html)
    # Channel CL0215, pages 0-1; page 0 has no index suffix.
    for i in range(0, 2):
        print(i)
        if i == 0:
            url = "http://yjj.hebei.gov.cn/CL0215/index.html"
        else:
            url = f"http://yjj.hebei.gov.cn/CL0215/index_{i}.html"
        html = get(url, code="gb2312")
        parse_index(html)
    # Channel CL0434 has a single index page (original degenerate loop unrolled).
    print(0)
    url = "http://yjj.hebei.gov.cn/CL0434/"
    html = get(url, code="gb2312")
    parse_index(html)
def main():
    """Fetch and parse the two Beijing Civil Affairs Bureau column indexes.

    The original wrapped each fetch in a degenerate `for i in range(0, 1)`
    loop; both loops are unrolled into one iteration over the two URLs,
    keeping the page-counter print.
    """
    for url in (
        "http://mzj.beijing.gov.cn/col/col6112/index.html",
        "http://mzj.beijing.gov.cn/col/col4492/index.html",
    ):
        print(0)  # preserves the original loop's counter output
        html = get(url)
        parse_index(html)
def main():
    """Fetch and parse the two Heilongjiang Education Dept planning indexes."""
    prefixes = (
        "http://jyt.hlj.gov.cn/zwgk/ghjh/fzgh",
        "http://jyt.hlj.gov.cn/zwgk/ghjh/gzyd",
    )
    for prefix in prefixes:
        print(0)
        page = get(prefix + "/", code="gbk")
        # NOTE: the keyword is spelled `url_preifx` to match parse_index's
        # signature elsewhere in this file — do not "fix" the typo here alone.
        parse_index(page, url_preifx=prefix)
def main():
    """Crawl CBIRC document listings: item 928 pages 16-88, then item 927 page 1."""
    for i in range(16, 89):
        print(i)
        # f-string replaces the original awkward `"…" + str( i)` concatenation.
        url = (
            "http://www.cbirc.gov.cn/cbircweb/DocInfo/SelectDocByItemIdAndChild"
            f"?itemId=928&pageSize=18&pageIndex={i}"
        )
        html = get(url)
        parse_index(html)
    # Item 927's first page is served as a pre-rendered JSON blob.
    url = ("http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/"
           "data_itemId=927,pageIndex=1,pageSize=18.json")
    html = get(url)
    parse_index(html)
def main():
    """Crawl Heilongjiang AMR affair listings: tid=2 (17 pages), tid=3 (10 pages).

    Bug fix: the original re-assigned `i = 2` inside the first loop (leftover
    debug code), so every tid=2 request fetched page 2. The stray assignment
    is removed so all 17 pages are actually crawled.
    """
    url = "http://amr.hlj.gov.cn/index/affairlist.html?"
    for page in range(1, 18):
        print(page)
        params = {'tid': '2', 'page': str(page)}
        html = get(url, params=params)
        parse_index(html)
    for page in range(1, 11):
        print(page)
        params = {'tid': '3', 'page': str(page)}
        html = get(url, params=params)
        parse_index(html)
def main():
    """Fetch and parse the Shaanxi AMR provincial-document index page."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://snamr.snaic.gov.cn/xw/szfwj.htm")
    parse_index(html)
def main():
    """Crawl two pages of the Xinjiang AMR public-information listing."""
    base = "http://www.xjzj.gov.cn/info/iList.jsp?"
    for page in range(1, 3):
        print(page)
        query = {
            'node_id': 'GKscjdj',
            'site_id': 'CMSxjamr',
            'cat_id': '10042',
            'cur_page': str(page),
        }
        parse_index(get(base, params=query))
def main():
    """Crawl pages 1-9 of the yp.yn.gov.cn news listing."""
    for page in range(1, 10):
        print(page)
        # f-string replaces the original `"…" + str( i)` concatenation.
        url = ("http://www.yp.yn.gov.cn/newsite/NewsList.aspx"
               f"?CID=b819ebae-9992-4331-bf4b-50974ce07b12&page={page}")
        parse_index(get(url))
def main():
    """Crawl pages 1-12 of the gat.gxzf.gov.cn article index."""
    for page in range(1, 13):
        print(page)
        # f-string replaces the original `"…" + str( i) + "…"` concatenation.
        url = f"http://gat.gxzf.gov.cn/n895440/n895445/n895495/index_898744_{page}.html"
        parse_index(get(url))
def main():
    """Crawl pages 1-23 of the Shaanxi Civil Affairs Dept disclosure listing."""
    for page in range(1, 24):
        print(page)
        # f-string replaces the original `"…" + str( i)` concatenation.
        url = ("http://mzt.shaanxi.gov.cn/zwgk_list.rt"
               f"?channlCid=0&channlId=41&pageNo={page}")
        parse_index(get(url))
def main():
    """Crawl two pages of the Yunnan tax bureau column listing."""
    base = "http://yunnan.chinatax.gov.cn/col/col3851/index.html?uid=5306&pageNum="
    for page in range(1, 3):
        print(page)
        document = get(base + str(page))
        parse_index(document)
def main():
    """Crawl pages 1-677 of the Fujian Education Dept search listing."""
    for page in range(1, 678):
        print(page)
        # f-string + readable literal split replace the original concatenation.
        url = (
            "http://jyt.fujian.gov.cn/was5/web/search?channelid=203116"
            "&templet=docs.jsp&sortfield=-docorderpri%2C-docreltime"
            f"&classsql=chnlid%3D30170&prepage=15&page={page}"
        )
        parse_index(get(url))
def main():
    """Crawl pages 1-4 of the Guangzhou Civil Affairs Bureau API listing."""
    for page in range(1, 5):
        print(page)
        # f-string replaces the original two-part string concatenation.
        url = f"http://mzj.gz.gov.cn/gkmlpt/api/all/344?page={page}&sid=200022"
        parse_index(get(url))
def main():
    """Crawl pages 1-17 of the Qinghai MPA information-disclosure listing."""
    for page in range(1, 18):
        print(page)
        # f-string + readable literal split replace the original concatenation;
        # the query carries pre-percent-encoded section name and GUID.
        url = (
            "http://ypjgj.qinghai.gov.cn/Article/ArticlePageYJJ"
            "?ParentSectionName=%E4%BF%A1%E6%81%AF%E5%85%AC%E5%BC%80"
            "&Section_ID=377E4C95-BE0C-4277-B3F2-94AA44A373CB"
            f"&page={page}&pageSize=15"
        )
        parse_index(get(url))
def main():
    """Fetch and parse the Yunnan AMR policy-document index page."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://amr.yn.gov.cn/zwgk/jcgk/zcwj.htm")
    parse_index(html)
def main():
    """Crawl pages 1-5 of the Gansu Civil Affairs Dept service listing."""
    for page in range(1, 6):
        print(page)
        # f-string replaces the original `"…" + str( i) + ".html"` concatenation.
        url = f"http://mzt.gansu.gov.cn/station/gssmzt/mzfw/list/{page}.html"
        parse_index(get(url))
def main():
    """Fetch and parse the Shandong SASAC channel ch05552 index page."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://gzw.shandong.gov.cn/channels/ch05552/")
    parse_index(html)
def main():
    """Fetch and parse the hbgf.hebnews.cn node_118305 index page."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://hbgf.hebnews.cn/node_118305.htm")
    parse_index(html)
def main():
    """Crawl pages 1-59 of the Tianjin education info listing."""
    for page in range(1, 60):
        print(page)
        # f-string replaces the original `"…" + str( i)` concatenation.
        url = ("http://jy.tj.gov.cn/info_list.jsp"
               f"?classid=201707190828598801&curPage={page}")
        parse_index(get(url))
def main():
    """Crawl pages 0-2 of the Shanghai SASAC normative-document listing."""
    for page in range(0, 3):
        print(page)
        # f-string replaces the original `"…" + str( i) + ".htm"` concatenation.
        url = ("https://www.gzw.sh.gov.cn/website/html/shgzw/"
               f"shgzw_flfg_zcfg_gfxwj/List/list_{page}.htm")
        parse_index(get(url))
def main():
    """Crawl pages 1-42 of the Qinghai Transport Dept policy listing."""
    base = "http://jtyst.qinghai.gov.cn/tzgg/zfxxgk/zfxxgkml/zcfg/?pi="
    for page in range(1, 43):
        print(page)
        listing = get(base + str(page))
        parse_index(listing)
def main():
    """Crawl pages 1-51 of the Anhui Health Commission disclosure listing."""
    endpoint = "http://wjw.ah.gov.cn/xxgk_data.html?"
    for page in range(1, 52):
        print(page)
        query = {
            'rn': '0.10413866167152985',  # fixed token copied from the original request
            'id': '',
            'cp': str(page),
            'year': '',
            'rootid': '10035',
        }
        parse_index(get(endpoint, params=query))
def main():
    """Crawl pages 1-13 of the sasac.gov.cn article index."""
    for page in range(1, 14):
        print(page)
        # f-string replaces the original `"…" + str( i) + ".html"` concatenation.
        url = ("http://www.sasac.gov.cn/n2588035/n2588320/n2588335/"
               f"index_2603340_{page}.html")
        parse_index(get(url))
def main():
    """Crawl pages 1-74 of the Heilongjiang Health Commission news listing."""
    base = "http://wsjkw.hlj.gov.cn/index.php/Home/News/all.shtml?typeid=33&p="
    for page in range(1, 75):
        print(page)
        body = get(base + str(page))
        parse_index(body)
def main():
    """Fetch and parse the Jilin SASAC policy-document index page (GBK)."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://gzw.jl.gov.cn/zcwj/", code="gbk")
    parse_index(html)
def main():
    """Crawl 41 pages of a JSON search API, collect document metadata, and
    export the date-sorted result to '摘要.xlsx'.

    Depends on module-level state not visible in this block: `url` (presumably
    a format string with one page-number slot — TODO confirm), `types`
    (category keys of the response's catMap), `rs_list` (accumulator list),
    and helpers `get` and `remove_style`; verify against the rest of the file.
    """
    for i in range(0, 41):
        page_url = url.format(i)
        print(page_url)
        # Earlier Selenium-based fetch path, kept for reference:
        # driver.get(url=page_url)
        # time.sleep(2)
        # html = driver.page_source
        # soup = BeautifulSoup(html, 'lxml')
        # #cc = soup.select('pre')[0]
        # rs = json.loads(soup.select('pre')[0])
        rs = json.loads(get(page_url))
        time.sleep(3)  # throttle between page requests
        #print(rs['searchVO']['catMap'].keys())
        for t in types:
            tls = rs['searchVO']['catMap'][t]['listVO']
            for item in tls:
                #print('{}|{}|{}|{}|{}'.format(item['title'],item['pubtimeStr'],item['url'],item['puborg'],item['subjectword']))
                rs_list.append(
                    (item['pubtimeStr'], t, item['title'], item['url'],
                     item['puborg'], item['childtype'], item['subjectword']))
    # Assemble all collected rows into a DataFrame, newest publications first.
    rs_df = pd.DataFrame(rs_list)
    rs_df.columns = [
        'pubtime', 'type', 'title', 'url', 'puborg', 'childtype', 'subjectword'
    ]
    # Swap '.' separators for '-' so pd.to_datetime can parse the dates.
    rs_df['pubtime'] = rs_df.pubtime.apply(lambda x: x.replace('.', '-'))
    rs_df['title'] = rs_df.title.apply(lambda x: remove_style(x))
    rs_df['pubtime'] = pd.to_datetime(rs_df['pubtime'])
    rs_df = rs_df.sort_values('pubtime', ascending=False)
    print(rs_df.pubtime.head())
    print(rs_df.shape)
    # Rename columns to the Chinese report headers expected in the export.
    rs_df.columns = ['发布时间', '类型', '标题', 'url', '发文机关', '主题分类', '主题词']
    rs_df.to_excel('摘要.xlsx', index=False)
def main():
    """Crawl pages 1-3 of the Fujian Civil Affairs Dept search listing."""
    for page in range(1, 4):
        print(page)
        # f-string + readable literal split replace the original concatenation.
        url = (
            "http://mzt.fujian.gov.cn/was5/web/search?channelid=229105"
            "&sortfield=-docreltime,-docorder&extension=&templet=docs.jsp"
            "&classsql=siteid%3D46*chnlid%3D18040&searchword=&prepage=20"
            f"&page={page}&r=0.26315070612868396"
        )
        parse_index(get(url))
def main():
    """Fetch and parse the Guangdong Health Commission policy-document index."""
    # Original degenerate `for i in range(0, 1)` loop unrolled; counter print kept.
    print(0)
    html = get("http://wsjkw.gd.gov.cn/hdjl/hdjl_zcwj/index.html")
    parse_index(html)
def main():
    """Crawl pages 2-27 of a Shaanxi news channel (GBK encoded)."""
    prefix = "http://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?Page="
    suffix = "&chid=100127"
    for page in range(2, 28):
        print(page)
        document = get(prefix + str(page) + suffix, code="gbk")
        parse_index(document)
def main():
    """Crawl pages 1-4 of the Fujian tax bureau search listing."""
    for page in range(1, 5):
        print(page)
        # f-string + readable literal split replace the original concatenation.
        url = (
            "http://fujian.chinatax.gov.cn/was5/web/search?channelid=203958"
            "&templet=docs.jsp&sortfield=-docorderpri%2C-docreltime"
            "&classsql=%20chnlid%3D%2021694*chnlid%3D21694"
            f"&random=0.8127466147433084&prepage=15&page={page}"
        )
        parse_index(get(url))