Example #1
0
def main():
    """Request listing pages of three yjj.hebei.gov.cn sections and parse them.

    For every page the page index is printed, the page URL is built, the
    page is requested through ``get`` and the result is passed to
    ``parse_index``.  Both helpers are defined outside this block.

    Sections and page ranges:
      * CL0214 -- pages 32..34, requested with ``code="gb2312"`` and retried
        with ``code="GB18030"`` if the first request raises.
      * CL0215 -- pages 0..1, requested with ``code="gb2312"``.
      * CL0434 -- the single section root page, ``code="gb2312"``.
    """
    site = "http://yjj.hebei.gov.cn/"

    def index_url(section, page):
        # Page 0 of a section is index.html; later pages are index_<n>.html.
        if page == 0:
            return site + section + "/index.html"
        return site + section + "/index_" + str(page) + ".html"

    for i in range(32, 35):
        print(i)
        url = index_url("CL0214", i)
        try:
            html = get(url, code="gb2312")
        except Exception:
            # Only ordinary errors trigger the retry; a bare ``except`` would
            # also swallow KeyboardInterrupt/SystemExit and fire a second
            # request instead of stopping.
            # NOTE(review): presumably the first attempt fails on decoding;
            # narrow further once the exception type raised by get is known.
            html = get(url, code="GB18030")
        parse_index(html)

    for i in range(0, 2):
        print(i)
        html = get(index_url("CL0215", i), code="gb2312")
        parse_index(html)

    # CL0434 is read from its section root only, reported as page 0.
    for i in range(0, 1):
        print(i)
        html = get("http://yjj.hebei.gov.cn/CL0434/", code="gb2312")
        parse_index(html)
Example #2
0
def main():
    """Request two column index pages on mzj.beijing.gov.cn and parse them.

    Each column has exactly one page; its index (0) is printed before the
    request.  ``get`` and ``parse_index`` are defined outside this block.
    """
    column_urls = (
        "http://mzj.beijing.gov.cn/col/col6112/index.html",
        "http://mzj.beijing.gov.cn/col/col4492/index.html",
    )
    for column_url in column_urls:
        for page in range(1):
            print(page)
            parse_index(get(column_url))
Example #3
0
def main():
    """Request two listing pages on jyt.hlj.gov.cn and parse them.

    Each page is requested with ``code="gbk"`` and parsed with the section
    URL (without trailing slash) passed as ``url_preifx``.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    # (page URL, prefix handed to parse_index)
    targets = (
        ("http://jyt.hlj.gov.cn/zwgk/ghjh/fzgh/",
         "http://jyt.hlj.gov.cn/zwgk/ghjh/fzgh"),
        ("http://jyt.hlj.gov.cn/zwgk/ghjh/gzyd/",
         "http://jyt.hlj.gov.cn/zwgk/ghjh/gzyd"),
    )
    for page_url, prefix in targets:
        for page in range(1):
            print(page)
            page_html = get(page_url, code="gbk")
            # Keyword spelling kept verbatim; presumably it matches the
            # parameter name in parse_index -- verify before renaming.
            parse_index(page_html, url_preifx=prefix)
Example #4
0
def main():
    """Request cbirc.gov.cn document listings and parse each response.

    Pages 16..88 of the itemId=928 listing are requested one by one (the
    page index is printed first), followed by a single static JSON file
    for itemId=927.  ``get`` and ``parse_index`` are defined outside this
    block.
    """
    paged_template = "http://www.cbirc.gov.cn/cbircweb/DocInfo/SelectDocByItemIdAndChild?itemId=928&pageSize=18&pageIndex={}"
    for page_index in range(16, 89):
        print(page_index)
        parse_index(get(paged_template.format(page_index)))

    static_listing = "http://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=927,pageIndex=1,pageSize=18.json"
    parse_index(get(static_listing))
Example #5
0
def main():
    """Request the paged affair lists on amr.hlj.gov.cn and parse each page.

    Two lists are walked: ``tid=2`` (pages 1..17) and ``tid=3`` (pages
    1..10).  For every page the page number is printed, the query is sent
    through ``get`` with ``params`` and the result is passed to
    ``parse_index``.  Both helpers are defined outside this block.
    """
    url = "http://amr.hlj.gov.cn/index/affairlist.html?"
    # (tid value, exclusive upper bound of the page range)
    for tid, stop in (('2', 18), ('3', 11)):
        for i in range(1, stop):
            print(i)
            # The loop variable is used as the page number; it was previously
            # overwritten with 2 in the tid=2 loop, so only page 2 was
            # ever requested for that list.
            params = {'tid': tid, 'page': str(i)}
            html = get(url, params=params)
            parse_index(html)
Example #6
0
def main():
    """Request the single listing page on snamr.snaic.gov.cn and parse it.

    The page index (always 0) is printed before the request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    listing_url = "http://snamr.snaic.gov.cn/xw/szfwj.htm"
    for page in range(1):
        print(page)
        parse_index(get(listing_url))
Example #7
0
def main():
    """Request pages 1..2 of a xjzj.gov.cn info list and parse each one.

    The page number is printed, then sent as ``cur_page`` together with
    the fixed query fields.  ``get`` and ``parse_index`` are defined
    outside this block.
    """
    endpoint = "http://www.xjzj.gov.cn/info/iList.jsp?"
    fixed_query = {'node_id': 'GKscjdj', 'site_id': 'CMSxjamr', 'cat_id': '10042'}
    for page in range(1, 3):
        print(page)
        # A fresh dict per request; cur_page is appended after the fixed keys.
        query = dict(fixed_query, cur_page=str(page))
        page_html = get(endpoint, params=query)
        parse_index(page_html)
Example #8
0
def main():
    """Request pages 1..9 of a yp.yn.gov.cn news list and parse each one.

    The page number is printed before each request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://www.yp.yn.gov.cn/newsite/NewsList.aspx?CID=b819ebae-9992-4331-bf4b-50974ce07b12&page={}"
    for page in range(1, 10):
        print(page)
        page_html = get(template.format(page))
        parse_index(page_html)
Example #9
0
def main():
    """Request index pages 1..12 on gat.gxzf.gov.cn and parse each one.

    The page number is printed before each request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://gat.gxzf.gov.cn/n895440/n895445/n895495/index_898744_{}.html"
    for page in range(1, 13):
        print(page)
        parse_index(get(template.format(page)))
Example #10
0
def main():
    """Request pages 1..23 of a mzt.shaanxi.gov.cn list and parse each one.

    The page number is printed, then appended as ``pageNo``.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://mzt.shaanxi.gov.cn/zwgk_list.rt?channlCid=0&channlId=41&pageNo={}"
    for page_no in range(1, 24):
        print(page_no)
        listing = get(template.format(page_no))
        parse_index(listing)
Example #11
0
def main():
    """Request pages 1..2 of a yunnan.chinatax.gov.cn column and parse them.

    The page number is printed, then appended as ``pageNum``.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://yunnan.chinatax.gov.cn/col/col3851/index.html?uid=5306&pageNum={}"
    for page_num in range(1, 3):
        print(page_num)
        parse_index(get(template.format(page_num)))
Example #12
0
def main():
    """Request pages 1..677 of a jyt.fujian.gov.cn search and parse each one.

    The page number is printed, then appended as the ``page`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://jyt.fujian.gov.cn/was5/web/search?channelid=203116&templet=docs.jsp&sortfield=-docorderpri%2C-docreltime&classsql=chnlid%3D30170&prepage=15&page={}"
    for page in range(1, 678):
        print(page)
        result_page = get(template.format(page))
        parse_index(result_page)
Example #13
0
def main():
    """Request pages 1..4 of a mzj.gz.gov.cn API listing and parse each one.

    The page number is printed, then placed in the ``page`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://mzj.gz.gov.cn/gkmlpt/api/all/344?page={}&sid=200022"
    for page in range(1, 5):
        print(page)
        parse_index(get(template.format(page)))
Example #14
0
def main():
    """Request pages 1..17 of a ypjgj.qinghai.gov.cn list and parse each one.

    The page number is printed, then placed in the ``page`` query value
    (``pageSize`` stays 15).  ``get`` and ``parse_index`` are defined
    outside this block.
    """
    template = "http://ypjgj.qinghai.gov.cn/Article/ArticlePageYJJ?ParentSectionName=%E4%BF%A1%E6%81%AF%E5%85%AC%E5%BC%80&Section_ID=377E4C95-BE0C-4277-B3F2-94AA44A373CB&page={}&pageSize=15"
    for page in range(1, 18):
        print(page)
        article_page = get(template.format(page))
        parse_index(article_page)
Example #15
0
def main():
    """Request the single listing page on amr.yn.gov.cn and parse it.

    The page index (always 0) is printed before the request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    listing_url = "http://amr.yn.gov.cn/zwgk/jcgk/zcwj.htm"
    for page in (0,):
        print(page)
        listing = get(listing_url)
        parse_index(listing)
Example #16
0
def main():
    """Request list pages 1..5 on mzt.gansu.gov.cn and parse each one.

    The page number is printed before each request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://mzt.gansu.gov.cn/station/gssmzt/mzfw/list/{}.html"
    for page in range(1, 6):
        print(page)
        parse_index(get(template.format(page)))
Example #17
0
def main():
    """Request the single channel page on gzw.shandong.gov.cn and parse it.

    The page index (always 0) is printed before the request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    channel_url = "http://gzw.shandong.gov.cn/channels/ch05552/"
    for page in range(1):
        print(page)
        parse_index(get(channel_url))
Example #18
0
def main():
    """Request the single node page on hbgf.hebnews.cn and parse it.

    The page index (always 0) is printed before the request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    node_url = "http://hbgf.hebnews.cn/node_118305.htm"
    for page in (0,):
        print(page)
        node_html = get(node_url)
        parse_index(node_html)
Example #19
0
def main():
    """Request pages 1..59 of a jy.tj.gov.cn info list and parse each one.

    The page number is printed, then appended as ``curPage``.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://jy.tj.gov.cn/info_list.jsp?classid=201707190828598801&curPage={}"
    for cur_page in range(1, 60):
        print(cur_page)
        parse_index(get(template.format(cur_page)))
Example #20
0
def main():
    """Request list pages 0..2 on www.gzw.sh.gov.cn and parse each one.

    The page number is printed before each request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "https://www.gzw.sh.gov.cn/website/html/shgzw/shgzw_flfg_zcfg_gfxwj/List/list_{}.htm"
    for page in range(3):
        print(page)
        list_html = get(template.format(page))
        parse_index(list_html)
Example #21
0
def main():
    """Request pages 1..42 of a jtyst.qinghai.gov.cn list and parse each one.

    The page number is printed, then appended as the ``pi`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://jtyst.qinghai.gov.cn/tzgg/zfxxgk/zfxxgkml/zcfg/?pi={}"
    for page in range(1, 43):
        print(page)
        parse_index(get(template.format(page)))
Example #22
0
def main():
    """Request pages 1..51 of the wjw.ah.gov.cn data endpoint and parse them.

    The page number is printed, then sent as ``cp`` together with the
    other fixed query fields.  ``get`` and ``parse_index`` are defined
    outside this block.
    """
    endpoint = "http://wjw.ah.gov.cn/xxgk_data.html?"
    for page in range(1, 52):
        print(page)
        query = {
            'rn': '0.10413866167152985',
            'id': '',
            'cp': str(page),
            'year': '',
            'rootid': '10035',
        }
        parse_index(get(endpoint, params=query))
Example #23
0
def main():
    """Request index pages 1..13 on www.sasac.gov.cn and parse each one.

    The page number is printed before each request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    template = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{}.html"
    for page in range(1, 14):
        print(page)
        index_html = get(template.format(page))
        parse_index(index_html)
Example #24
0
def main():
    """Request pages 1..74 of a wsjkw.hlj.gov.cn news list and parse them.

    The page number is printed, then appended as the ``p`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://wsjkw.hlj.gov.cn/index.php/Home/News/all.shtml?typeid=33&p={}"
    for page in range(1, 75):
        print(page)
        parse_index(get(template.format(page)))
Example #25
0
def main():
    """Request the single listing page on gzw.jl.gov.cn and parse it.

    The page index (always 0) is printed, then the page is requested with
    ``code="gbk"``.  ``get`` and ``parse_index`` are defined outside this
    block.
    """
    listing_url = "http://gzw.jl.gov.cn/zcwj/"
    for page in range(1):
        print(page)
        listing = get(listing_url, code="gbk")
        parse_index(listing)
Example #26
0
def main():
    """Collect search results from 41 result pages and export them to Excel.

    Relies on names defined outside this block: ``url`` (a template that is
    filled with the page number through ``str.format``), ``types`` (the
    ``catMap`` keys to read from each response), ``rs_list`` (the list that
    result rows are appended to), ``get`` and ``remove_style``.

    For each page the response text is decoded as JSON and, for every type,
    one tuple per entry of ``searchVO.catMap[type].listVO`` is appended to
    ``rs_list``.  The rows are then loaded into a DataFrame, sorted by
    publication time (newest first) and written to ``摘要.xlsx``.
    """
    columns = [
        'pubtime', 'type', 'title', 'url', 'puborg', 'childtype', 'subjectword'
    ]

    for i in range(0, 41):
        page_url = url.format(i)
        print(page_url)

        # The response is parsed directly as JSON (a browser-driver based
        # fetch was previously sketched here and has been dropped).
        rs = json.loads(get(page_url))
        # Pause after every request before continuing.
        time.sleep(3)
        for t in types:
            tls = rs['searchVO']['catMap'][t]['listVO']
            for item in tls:
                rs_list.append(
                    (item['pubtimeStr'], t, item['title'], item['url'],
                     item['puborg'], item['childtype'], item['subjectword']))

    # Passing the names to the constructor keeps this working when no rows
    # were collected; assigning ``.columns`` afterwards raises ValueError on
    # an empty frame because it has zero columns.
    rs_df = pd.DataFrame(rs_list, columns=columns)
    # Dates arrive dot-separated; switch to dashes before parsing.
    rs_df['pubtime'] = rs_df.pubtime.apply(lambda x: x.replace('.', '-'))
    rs_df['title'] = rs_df.title.apply(remove_style)
    rs_df['pubtime'] = pd.to_datetime(rs_df['pubtime'])
    rs_df = rs_df.sort_values('pubtime', ascending=False)
    print(rs_df.pubtime.head())
    print(rs_df.shape)
    # Column headers used in the exported sheet.
    rs_df.columns = ['发布时间', '类型', '标题', 'url', '发文机关', '主题分类', '主题词']
    rs_df.to_excel('摘要.xlsx', index=False)
Example #27
0
def main():
    """Request pages 1..3 of a mzt.fujian.gov.cn search and parse each one.

    The page number is printed, then placed in the ``page`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://mzt.fujian.gov.cn/was5/web/search?channelid=229105&sortfield=-docreltime,-docorder&extension=&templet=docs.jsp&classsql=siteid%3D46*chnlid%3D18040&searchword=&prepage=20&page={}&r=0.26315070612868396"
    for page in range(1, 4):
        print(page)
        result_page = get(template.format(page))
        parse_index(result_page)
Example #28
0
def main():
    """Request the single index page on wsjkw.gd.gov.cn and parse it.

    The page index (always 0) is printed before the request.  ``get`` and
    ``parse_index`` are defined outside this block.
    """
    index_url = "http://wsjkw.gd.gov.cn/hdjl/hdjl_zcwj/index.html"
    for page in (0,):
        print(page)
        parse_index(get(index_url))
Example #29
0
def main():
    """Request pages 2..27 of a sxgz.shaanxi.gov.cn channel and parse them.

    The page number is printed, placed in the ``Page`` query value and the
    page is requested with ``code="gbk"``.  ``get`` and ``parse_index`` are
    defined outside this block.
    """
    template = "http://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?Page={}&chid=100127"
    for page in range(2, 28):
        print(page)
        channel_html = get(template.format(page), code="gbk")
        parse_index(channel_html)
Example #30
0
def main():
    """Request pages 1..4 of a fujian.chinatax.gov.cn search and parse them.

    The page number is printed, then appended as the ``page`` query value.
    ``get`` and ``parse_index`` are defined outside this block.
    """
    template = "http://fujian.chinatax.gov.cn/was5/web/search?channelid=203958&templet=docs.jsp&sortfield=-docorderpri%2C-docreltime&classsql=%20chnlid%3D%2021694*chnlid%3D21694&random=0.8127466147433084&prepage=15&page={}"
    for page in range(1, 5):
        print(page)
        parse_index(get(template.format(page)))