コード例 #1
0
ファイル: temp10.py プロジェクト: Yuan-zewei/jzsc
 def qyjcxx(self):
     """Crawl the pending sections of the next company in the re-crawl table.

     Reads one row via ``Mysql.qiyexx_url(bh='1')``, stores its id and name
     on ``self``, refreshes the remote company id through ``gx_qyid`` and
     then runs the basic-info, qualification and staff crawlers for every
     section whose status column (``row[7]``, ``row[8]``, ``row[9]``) equals
     ``'1'``.  The row is read again afterwards; once all three sections are
     finished (``'0'``, or ``'404'`` for the last two) the company is marked
     with ``cx_state='0'``.

     Every exception is caught and printed, so the method always returns
     None.
     """
     try:
         row = Mysql.qiyexx_url(bh='1')[0]  # next row from the re-crawl table
         if row is None:
             print('没有数据可以爬取')
             time.sleep(10)
             return

         self.qyid = row[0]  # eid
         self.z = row[2]  # company name
         # TODO: this lookup could be done only after a failed/empty load
         # instead of on every run.
         self.qyid1 = self.gx_qyid()  # qyid

         if row[7] == '1':
             self.jichu12()  # basic information
         else:
             print('基础信息爬取完毕')
         if row[8] == '1':
             self.qyzz()  # qualification information
         else:
             print('资质信息爬取完毕')
         if row[9] == '1':
             self.qy_user()  # staff information
         else:
             print('人员信息爬取完毕')

         # Re-read the row to pick up the statuses the crawlers just wrote.
         # NOTE(review): assumes the query returns the same company again --
         # confirm against Mysql.qiyexx_url.
         row = Mysql.qiyexx_url(bh='1')[0]
         if row is None:
             # The first read is None-checked; guard the second one as well
             # instead of relying on the TypeError being swallowed below.
             return
         if (row[7] == '0' and row[8] in ('0', '404')
                 and row[9] in ('0', '404')):
             Mysql.gxqy_fupa(cx_state='0', eid=self.qyid)
             print('状态更新完毕')
     except Exception as e:
         print(e, 'jgfufh')
コード例 #2
0
def pingxiang2():
    """Crawl the Pingxiang DRC news list (classid=15) and store new articles.

    The first list page supplies the page count; each list page is then
    fetched, flattened (newlines/tabs removed) and matched for
    ``(href, title, date)`` triples.  For every entry the detail page is
    downloaded, ``/html/body/div[5]`` is serialised as the article body, and
    the article is inserted unless ``Mysql.select_xinwen`` already knows the
    title.

    Raises whatever ``requests``/``lxml`` raise, and IndexError when the page
    counter or the body ``div`` is not found.
    """
    site = 'http://pxdpc.pingxiang.gov.cn/'
    url = 'http://pxdpc.pingxiang.gov.cn/list.asp?classid=15'
    # Raw strings: "\d" and "\[" are regex escapes, not Python string escapes.
    page_count_re = r'每页20条, 1/(\d+)页'
    entry_re = r'&nbsp;                    <a href="(.*?)" target="_blank">(.*?)</a></td>                  <td width="11%" class="font_hui12">\[(.*?)\]</td>'

    tt = requests.get(url).content.decode('utf-8')
    pages = re.findall(page_count_re, tt)[0]
    print(f'共{pages}页')
    for page in range(1, int(pages) + 1):
        url1 = url + f'&p={page}'
        tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        for content in re.findall(entry_re, tt):
            linkurl = site + content[0]
            # Make upload links absolute so they still resolve once stored.
            detail_res = requests.get(linkurl).content.decode('utf-8').replace(
                '/upload/', 'http://pxdpc.pingxiang.gov.cn/upload/')
            Html = etree.HTML(detail_res)
            div1 = Html.xpath("/html/body/div[5]")[0]  # article container
            # Quotes are blanked out before the HTML is handed to Mysql.
            infocontent = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace(
                '"', ' ')
            title = content[1]
            publicTime = content[2].replace('                    ', '')
            select = Mysql.select_xinwen(title=title)  # is the title already stored?
            # "not select" covers both a None result and an empty result set.
            if not select:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                             publicTime=publicTime, linkurl=linkurl, title=title,
                                             dataResource='', yewuType='发改委', infoType='', infoState='', isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print(f'第{page}页标题存在')
        print(f'第{page}页已爬完')
コード例 #3
0
def ryxx_xinxi(resp, user, zc_dwid):
    """Insert one registration record of a person into the log table.

    Args:
        resp: mapping with the keys ``RY_NAME``, ``REG_TYPE_NAME``,
            ``REG_CERTNO``, ``REG_SEAL_CODE``, ``REG_EDATE``, ``QY_NAME`` and
            ``REG_PROF_NAME``.
        user: user id stored as ``userid`` and echoed in the log line.
        zc_dwid: id stored as ``zc_dwid``.

    Returns:
        None.  Any exception (including a missing key) is caught and printed.
    """
    try:
        name = resp['RY_NAME']
        zclb = resp['REG_TYPE_NAME']
        zsbh = resp['REG_CERTNO']
        if zsbh is None:
            # A missing certificate number is stored as an empty string.
            zsbh = ''
        zyyzh = resp['REG_SEAL_CODE']
        yxq = time_s(resp['REG_EDATE'])
        zc_dw = resp['QY_NAME']
        zc_zy = resp['REG_PROF_NAME']
        Mysql.inserttbl_user_zcxx_log1(userid=user,
                                       zclb=zclb,
                                       zsbh=zsbh,
                                       zyyzh=zyyzh,
                                       yxq=yxq,
                                       zc_dwid=zc_dwid,
                                       zc_dw=zc_dw,
                                       zc_zy=zc_zy,
                                       drjs='')
        print(f'{name}{user}注册信息插入完成')
    except Exception as e:
        print('注册信息报错', e)
コード例 #4
0
def chuli(publictime,href,driver,url,title,city,xpath1):
    """Resolve a list-page ``href`` to an absolute link and store the item.

    Link resolution, in order:
      * ``href`` contains ``http``: used unchanged;
      * ``href`` contains ``../``: the anchor under ``xpath1`` is clicked and
        the URL of the window that opens is used;
      * ``href`` contains ``./``: ``./`` is stripped and the rest appended to
        ``url``;
      * otherwise the site root (``http….cn`` or, failing that, ``http….com``)
        is taken from ``url`` and ``href`` appended to it.

    Args:
        publictime: publication time stored as ``fabutime``.
        href: raw href taken from the list page.
        driver: Selenium WebDriver positioned on the list page.
        url: URL of the list page.
        title: headline stored as ``biaoti``.
        city: stored as ``dijishi``.
        xpath1: XPath of the list item whose ``a`` child is clicked.

    Returns:
        None.  Any exception is caught and printed.
    """
    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if re.findall('http', href):
            link = href
        elif '../' in href:
            # Let the browser resolve the parent-relative link.
            driver.find_element_by_xpath(f"{xpath1}/a").click()
            b_handle = driver.current_window_handle  # list-page window
            s_handle = None
            for handle in driver.window_handles:
                if handle != b_handle:
                    s_handle = handle
            driver.switch_to.window(s_handle)  # work in the new window
            link = driver.current_url  # URL of the detail page
            driver.close()
            driver.switch_to.window(b_handle)  # back to the list page
        elif './' in href:
            link = url + href.replace('./', '')
        else:
            # Site root: prefer a ".cn" host, fall back to ".com".  The
            # fallback used to re-append ".cn", producing a wrong domain.
            hosts = re.findall(r'http(.*?)\.cn', url)
            suffix = '.cn'
            if not hosts:
                hosts = re.findall(r'http(.*?)\.com', url)
                suffix = '.com'
            root = 'http' + hosts[0] + suffix
            link = root + href if href[0] == '/' else root + '/' + href
        uid = uuid.uuid4()
        # `pro` (province) is not defined in this function; it is expected to
        # exist at module level.
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')

    except Exception as e:
        print('处理\t', e)
コード例 #5
0
def qyzz(resp, qyid):
    """Insert or update one qualification certificate of a company.

    Args:
        resp: mapping with the keys ``APT_TYPE_NAME``, ``APT_CERTNO``,
            ``APT_NAME``, ``APT_GET_DATE``, ``APT_EDATE`` and
            ``APT_GRANT_UNIT``.
        qyid: company id the certificate belongs to.

    Returns:
        None.  Any exception is passed to ``util.logger.error``.
    """
    try:
        zzzsh = resp['APT_CERTNO']
        zzmc = resp['APT_NAME']
        fields = dict(zzlx=resp['APT_TYPE_NAME'],
                      zsh=zzzsh,
                      zzmc=zzmc,
                      fzrq=time_s(resp['APT_GET_DATE']),
                      zsyxq=time_s(resp['APT_EDATE']),
                      fzjg=resp['APT_GRANT_UNIT'],
                      qyid=qyid)
        zc_fw = resp['APT_NAME']  # the scope column reuses the certificate name
        cx = Mysql.selecttbl_qy_zz(qyid=qyid, zsh=zzzsh, zzmc=zzmc)
        # NOTE(review): insert takes the scope as `zzfw`, update as `zc_fw`;
        # both keywords are kept exactly as the original calls used them.
        if cx is None:
            Mysql.inserttbl_qy_zz(zzfw=zc_fw, **fields)
        else:
            Mysql.updatetbl_qy_zz(zc_fw=zc_fw, **fields)
    except Exception as e:
        util.logger.error(e)
コード例 #6
0
ファイル: hebei.py プロジェクト: liman21/xinwen
def zhangjiakou():
    """Crawl Zhangjiakou government news list pages and store new articles.

    For pages 1..373 three list URLs are fetched through ``ipmax()`` proxies
    and matched against two list-row layouts.  The first hit of each layout
    is downloaded, its whole document serialised as the article body, and
    the article is stored unless ``Mysql.select_xinwen`` already knows the
    title.

    On any exception the error is printed and the whole crawl starts over
    from page 1.  This is done in a loop rather than by recursion, so a
    flaky proxy can no longer exhaust the call stack.
    """
    # Raw strings: "\[" is a regex escape, not a Python string escape.
    row_patterns = (
        r'"hg" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></td>                    <td width="80" class="cdate">\[(.*?)\]</td>',
        r'hg" href="(.*?)" title="(.*?)" target="_blank">(.*?)</a></td>              <td width="100" class="cdate">\[(.*?)\]</td>',
    )
    while True:
        try:
            for page in range(1, 374):
                url1s = [
                    'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index.html',  # Zhangjiakou headlines
                    f'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index_{page}.html',  # Zhangjiakou headlines
                    f'http://www.zjk.gov.cn/bmgz_frame1.jsp?pages={page}',  # department work
                ]
                for url1 in url1s:
                    contents1 = requests.get(
                        url1, proxies=ipmax()).content.decode('utf-8').replace(
                            '\n', '').replace('\r', '').replace('\t', '')
                    for pattern in row_patterns:
                        matches = re.findall(pattern, contents1)
                        if not matches:
                            continue
                        # NOTE(review): only the first row of each layout is
                        # processed, as in the original -- confirm this is
                        # intended rather than iterating every match.
                        content = matches[0]
                        uu = re.findall('www.(.*?).gov', url1)[0]
                        linkurl = f'http://www.{uu}.gov.cn' + content[0].strip()
                        detail_res = requests.get(linkurl).content.decode(
                            'utf-8')
                        Html = etree.HTML(detail_res)
                        infocontent = html.unescape(
                            etree.tostring(Html,
                                           method='html').decode()).replace(
                                               "'", " ").replace('"', ' ')
                        title = content[1].strip()
                        publicTime = content[3].strip()
                        select = Mysql.select_xinwen(title=title)  # is the title already stored?
                        # "not select" covers both a None result and an empty
                        # result set (sibling crawlers test len(select) == 0).
                        if not select:
                            uid = uuid.uuid4()
                            # Region fields were copied from the Chengde
                            # crawler; 075000 is Zhangjiakou's code in the
                            # scheme the sibling crawlers use.
                            Mysql.insert_xinwen_baseinfo(uid=uid,
                                                         regionCode='075000',
                                                         regionName='河北省',
                                                         areaRegion='张家口市',
                                                         publicTime=publicTime,
                                                         linkurl=linkurl,
                                                         title=title,
                                                         dataResource='',
                                                         yewuType='',
                                                         infoType='',
                                                         infoState='',
                                                         isok='',
                                                         isdeal='')
                            Mysql.insert_xinwen_detailinfo(
                                uid=uid, infocontent=infocontent)
                        else:
                            print('标题存在')
            return
        except Exception as e:
            print('张家口\t', e)
コード例 #7
0
def get_id(company):
    """Look up the qcc.com firm id of ``company`` via the site search page.

    A cookie row is taken from ``Mysql.select_qycookie()`` for each attempt.
    When the response looks like a redirect or an expired session, that
    cookie row is deleted and the search is retried with the next one.

    Args:
        company: company name to search for.

    Returns:
        The firm id string of the first search hit whose name equals
        ``company``, or None (after printing ``'ff'``) when there is no such
        hit.

    Raises:
        Whatever ``requests`` raises, and TypeError/IndexError when
        ``Mysql.select_qycookie()`` yields no usable row.
    """
    url = f'https://www.qcc.com/search?key={company}'
    headers = {
        'authority':
        'www.qcc.com',
        'method':
        'GET',
        'scheme':
        'https',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'zh-CN,zh;q=0.9',
        # Placeholder; filled from the cookie pool on every attempt.  The
        # hard-coded session cookie that used to sit here was always
        # overwritten before the request was sent.
        'cookie':
        '',
        'sec-fetch-dest':
        'document',
        'sec-fetch-mode':
        'navigate',
        'sec-fetch-site':
        'same-origin',
        'sec-fetch-user':
        '******',
        'upgrade-insecure-requests':
        '1',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    }
    data = {'key': f'{company}'}
    # The name goes into a regex below; escape it so names containing "(",
    # ")", "." or "+" are matched literally.
    hit_re = f"内容类型:企业,内容名称:{re.escape(company)},内容链接:/firm/(.*?).html,内容位置:第1个"

    # Loop instead of recursing so a long run of dead cookies cannot
    # overflow the call stack.
    while True:
        f = Mysql.select_qycookie()
        # Turn the stored serialised cookie text into "k=v;k=v" form.
        cookies = f[1][16:-2]
        headers['cookie'] = str(cookies[2:-2]).replace('": "', '=').replace('", "', ';')
        con = requests.get(url, headers=headers, params=data,
                           proxies=ipmax()).content.decode('utf-8').replace(
                               '\r', '').replace('\t', '').replace(
                                   '\n', '').replace(' ', '').replace("'", '')
        # NOTE(review): con[:-50] is everything except the last 50 chars;
        # con[-50:] may have been intended -- kept as in the original.
        if 'location' in con[0:30] or 'varexpiredate' in con[:-50]:
            Mysql.delete_qycookie(uid=f[0])
            continue
        qyid = re.findall(hit_re, con)
        if qyid:
            return qyid[0]
        print('ff')
        return None
コード例 #8
0
def ryxx(resp, qyid, user):
    """Store the basic record of one staff member, replacing an existing one.

    Args:
        resp: mapping with the keys ``RY_NAME``, ``IDCARD``,
            ``RY_CARDTYPE_NAME`` and ``RY_SEX_NAME``.
        qyid: company id the person belongs to.
        user: user id stored as ``userid`` for the new row.

    Returns:
        None.  Any exception is passed to ``util.logger.error``.
    """
    try:
        name = resp['RY_NAME']
        zjhm = resp['IDCARD']
        zczy = resp['RY_CARDTYPE_NAME']
        sex = resp['RY_SEX_NAME']
        ues = Mysql.selecttbl_qiye_user_qyid(username=name,
                                             sex=sex,
                                             zjlx=zczy,
                                             zjhm=zjhm,
                                             qyid=qyid)
        print(ues)
        if ues is not None:
            # Person already stored: drop the old basic and registration
            # rows before inserting the fresh record.
            Mysql.delete_tbl_user_user(userid=ues[0])
            Mysql.deletetbl_user_zcxx1_user(userid=ues[0])
            print('这个人员已存在,删除人员的基础注册信息')
        # The insert is the same whether or not an old record was removed.
        Mysql.inserttbl_user(username=name,
                             sex=sex,
                             zjlx=zczy,
                             zjhm=zjhm,
                             qyid=qyid,
                             userid=user)
        print(f'{name}{user}基础信息插入完成')
    except Exception as e:
        util.logger.error(e)
コード例 #9
0
ファイル: hq_token.py プロジェクト: Yuan-zewei/jzsc
def gx_qyid(z, eid):
    """Look up the remote company id for name ``z`` and store it for ``eid``.

    Queries the jzsc company-list API with ``z`` as ``complexname`` and
    decodes the response with ``jd_nx``.  When the list is empty the row is
    flagged with ``cx_state='3'`` and None is returned; otherwise the
    ``QY_ID`` of the first hit is written via ``Mysql.update_qyid`` and
    returned.
    """
    print('开始更新企业id')
    search_url = ('http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list'
                  f'?complexname={z}&pg=0&pgsz=15&total=0')
    response = requests.get(url=search_url, headers=headers)
    decoded = jd_nx(data=f'{response.text}')
    hits = decoded['data']['list']

    if len(hits) == 0:
        print('没有这个公司异常')
        Mysql.gxqy_fupa(cx_state='3', eid=eid)
        return None

    company_id = hits[0]['QY_ID']
    Mysql.update_qyid(qyurl=company_id, eid=eid)  # store the refreshed company id
    return company_id
コード例 #10
0
ファイル: hebei.py プロジェクト: liman21/xinwen
def shijiazhuang():
    """Crawl the Shijiazhuang government news columns and store new articles.

    For each column the first page supplies the page count; every page is
    then fetched with ``&current=<page>``, the list block is cut out and
    matched for ``(href, title, text, date)`` rows.  Each detail page is
    downloaded, ``/html/body/div/div[2]`` is serialised as the article body,
    and the article is stored unless ``Mysql.select_xinwen`` already knows
    the title.

    Raises whatever ``requests``/``lxml`` raise, and IndexError when an
    expected fragment is missing from a page.
    """
    url1s = [
        # 'http://www.sjz.gov.cn/column.jsp?id=1490076462404',  # city headlines
        'http://www.sjz.gov.cn/column.jsp?id=1490076534390',  # department news
        'http://www.sjz.gov.cn/column.jsp?id=1490076571666',  # district/county news
    ]
    for url1 in url1s:
        tt = requests.get(url1).content.decode('gb2312')
        # Raw string: "\d" is a regex escape, not a Python string escape.
        pages = re.findall(r"title='每页显示.*记录'>共.*条(\d+)页", tt)[0]
        for page in range(1, int(pages) + 1):
            url = f'{url1}&current={page}'
            # Fetch the paged URL; the original requested `url1` again and so
            # re-read page 1 on every iteration.
            contents1 = requests.get(url).content.decode('gb2312').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            contents2 = re.findall(
                '1 list_2"><ul>(.*?)/ul></div></div><div style="text-align:',
                contents1)
            contents = re.findall(
                'href="(.*?)" target="_blank"  style="line-height:30px;" title="(.*?)">(.*?)</a>&nbsp;<span class="date" style="color:#898989">(.*?)</span>',
                contents2[0])
            for content in contents:
                linkurl = 'http://www.sjz.gov.cn' + content[0]
                detail_res = requests.get(linkurl).content.decode('gb2312')
                Html = etree.HTML(detail_res)
                div = Html.xpath("/html/body/div/div[2]")[0]
                infocontent = html.unescape(
                    etree.tostring(div, method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                title = content[1]
                publicTime = content[3]
                select = Mysql.select_xinwen(title=title)  # is the title already stored?
                # "not select" covers both a None result and an empty result
                # set (sibling crawlers test len(select) == 0).
                if not select:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='050000',
                                                 regionName='河北省',
                                                 areaRegion='石家庄市',
                                                 publicTime=publicTime,
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)
                else:
                    print('标题存在')
                print('gg')
コード例 #11
0
ファイル: hebei.py プロジェクト: liman21/xinwen
def chengde():
    """Crawl the Chengde government headline column and store new articles.

    Pages 1..373 of column 360 are fetched; every list row yields a link,
    title and date.  Each detail page is downloaded, its whole document
    serialised as the article body, and the article is stored unless
    ``Mysql.select_xinwen`` already knows the title.

    Raises whatever ``requests``/``lxml`` raise, and IndexError when a list
    row does not match the expected layout.
    """
    # Raw strings: "\'" and "\[" are regex escapes, not Python string escapes.
    link_pattern = r"href=\'(.*?)\'title=\'(.*?)\'target"
    date_pattern = r'target="_blank">(.*?)</a><span class="bt-data-time"style="font-size:14px;">\[(.*?)\]'
    for page in range(1, 374):
        url1s = [
            f'http://www.chengde.gov.cn/col/col360/index.html?uid=1412&pageNum={page}',  # city headlines  1361
            # 'http://www.chengde.gov.cn/col/col361/index.html?uid=1412&pageNum={page}',    # Chengde in outside media  367
            # 'http://www.chengde.gov.cn/col/col362/index.html?uid=1412&pageNum={page}',    # Chengde in outside media  374
            # 'http://www.chengde.gov.cn/col/col364/index.html?uid=1412&pageNum={page}',    # public notices    27
        ]
        for url1 in url1s:
            contents1 = requests.get(url1).content.decode('utf-8').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            for content in re.findall('pan><a (.*?)</span>', contents1):
                co = re.findall(link_pattern, content)[0]
                co1 = re.findall(date_pattern, content)[0]

                linkurl = 'http://www.chengde.gov.cn' + co[0]
                detail_res = requests.get(linkurl).content.decode('utf-8')
                Html = etree.HTML(detail_res)
                infocontent = html.unescape(
                    etree.tostring(Html, method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                title = co[1]
                publicTime = co1[1]
                select = Mysql.select_xinwen(title=title)  # is the title already stored?
                # "not select" covers both a None result and an empty result
                # set (sibling crawlers test len(select) == 0).
                if not select:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='067000',
                                                 regionName='河北省',
                                                 areaRegion='承德市',
                                                 publicTime=publicTime,
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)
                else:
                    print('标题存在')
コード例 #12
0
def chuli1(publictime, href, url, title, city):
    """Resolve a list-page ``href`` to an absolute link and store the item.

    Link resolution, in order:
      * ``href`` contains ``http``: used unchanged;
      * ``href`` contains ``./``: ``./`` is stripped and the rest appended to
        ``url``;
      * otherwise the site root (``http….cn`` or, failing that, ``http….com``)
        is taken from ``url`` and ``href`` appended to it.

    Args:
        publictime: publication time stored as ``fabutime``.
        href: raw href taken from the list page.
        url: URL of the list page.
        title: headline stored as ``biaoti``.
        city: stored as ``dijishi``.

    Returns:
        None.  Any exception is caught and printed.
    """
    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if re.findall('http', href):
            link = href
        elif './' in href:
            link = url + href.replace('./', '')
        else:
            # Site root: prefer a ".cn" host, fall back to ".com" so list
            # pages on .com domains no longer fail with IndexError.
            hosts = re.findall(r'http(.*?)\.cn', url)
            suffix = '.cn'
            if not hosts:
                hosts = re.findall(r'http(.*?)\.com', url)
                suffix = '.com'
            root = 'http' + hosts[0] + suffix
            link = root + href if href[0] == '/' else root + '/' + href
        uid = uuid.uuid4()
        # `pro` (province) is not defined in this function; it is expected to
        # exist at module level.
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')

    except Exception as e:
        print('处理\t', e)
コード例 #13
0
def qyjichu(resp, qyid):
    """Insert or update the basic record of one company.

    Args:
        resp: mapping with the keys ``QY_NAME``, ``QY_ORG_CODE``,
            ``QY_FR_NAME``, ``QY_GSZCLX_NAME``, ``QY_REGION_NAME`` and
            ``QY_ADDR``.
        qyid: company id used for the lookup and stored with the record.

    Raises:
        KeyError: when ``resp`` lacks one of the keys above.
    """
    print(resp)
    gsname = resp['QY_NAME']  # company name
    record = dict(qyid=qyid,
                  xydm=resp['QY_ORG_CODE'],  # unified social credit code
                  zjjgid="",
                  qyname=gsname,
                  frdb=resp['QY_FR_NAME'],  # legal representative
                  qyzcsd="",
                  zclx=resp['QY_GSZCLX_NAME'],  # registration type
                  zcsd=resp['QY_REGION_NAME'],  # registered region
                  jydz=resp['QY_ADDR'])  # business address
    qy = Mysql.selecttbl_qy(qyid=qyid)
    print(qy)
    if qy is None:
        print(
            f'-------------------------------{gsname}的基础信息正在插入--------------------------------'
        )
        Mysql.inserttbl_qy(**record)
    else:
        print(
            f'-------------------------------{gsname}的基础信息正在更新--------------------------------'
        )
        Mysql.updatetbl_qy(**record)
コード例 #14
0
ファイル: temp10.py プロジェクト: Yuan-zewei/jzsc
 def jichu12(self):
     """Fetch the basic company record for ``self.qyid1`` and store it.

     Requests the compDetail API through ``self.ip`` and decodes the reply
     with ``self.jd_nx``.  Outcomes:
       * server-busy text in the reply: printed, nothing else happens;
       * ``code`` other than 200: the current token is dropped and a new
         token/proxy pair is loaded;
       * ``data`` is None: the remote company id is refreshed via
         ``gx_qyid``;
       * otherwise the record is stored and the row's ``qy_jcxx_zt`` is set
         to ``'0'``.

     Returns:
         ``'0'`` when the record was stored, otherwise None.  Exceptions are
         caught: ones mentioning ``HTTPConnectionPool`` rotate the
         token/proxy pair, all others are printed.
     """

     def rotate_token():
         # Drop the current token, then load a replacement token/proxy pair.
         Mysql.dele_token(token=self.jichu)
         print('token删除成功')
         Mysql.token(token=self.jichu)
         # Call jichutoken() once: the original called it three times, so the
         # token and the proxy address could come from different results.
         fresh = self.jichutoken()
         self.jichu = fresh[0]
         self.ip = {
             "http": "http://" + fresh[1],
             "https": "https://" + fresh[1]
         }

     try:
         qy_jichu = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId={self.qyid1}'
         resp1 = requests.get(url=qy_jichu,
                              headers=self.headers,
                              proxies=self.ip,
                              timeout=100)
         if '服务器繁忙,请稍后重试' in resp1.text:
             print('服务器繁忙,请稍后重试')
             return None
         asddd2 = self.jd_nx(data=f'{resp1.text}')
         if asddd2['code'] != 200:
             rotate_token()
         elif asddd2['data'] is None:
             self.gx_qyid()
         else:
             qyxx.qyjichu(asddd2['data']['compMap'], qyid=self.qyid)
             Mysql.update_qyjcxx(qy_jcxx_zt='0', eid=self.qyid)
             return '0'
     except Exception as e:
         if "HTTPConnectionPool" in str(e):
             print('ip失效')
             rotate_token()
         else:
             print('不存在')
             print(e, '基础信息错误')
コード例 #15
0
ファイル: base.py プロジェクト: Yuan-zewei/jzsc
 def jichutoken(self):
     """Block until ``Mysql.jichutoken(yxq='0')`` yields a truthy result.

     The query is repeated every 5 seconds until it returns something
     truthy, which is then returned unchanged.  If anything raises, the
     error is printed and None is returned.
     """
     try:
         found = Mysql.jichutoken(yxq='0')
         while not found:
             time.sleep(5)
             found = Mysql.jichutoken(yxq='0')
         return found
     except Exception as err:
         print(err, '文件错误')
コード例 #16
0
ファイル: hebei.py プロジェクト: liman21/xinwen
def shengyw():
    """Crawl the Hebei provincial government news list and store new articles.

    The index page supplies the page count; each list page is fetched,
    flattened and matched for ``(href, title, text, date)`` rows.  Each
    detail page is downloaded, the second ``div`` of the content container
    is serialised as the article body, and the article is stored unless
    ``Mysql.select_xinwen`` already knows the title.

    Raises whatever ``requests``/``lxml`` raise, and IndexError when the page
    counter or the content container is missing.
    """
    url = 'http://www.hebei.gov.cn/hebei/13863674/13871225/index.html'
    # Raw strings: "\d" and "\(" are regex escapes, not Python string escapes.
    row_pattern = r'<a href="(.*?)" onclick="void\(0\)" target="_blank" title="(.*?)" istitle="true">(.*?)</a> <span class="date" style="font-size: 12px;color: #898989;padding-left: 5px;">(.*?)</span> </li>'
    tt = requests.get(url).content.decode('utf-8')
    pages = re.findall(r'totalpage="(\d+)"', tt)[0]
    for page in range(1, int(pages) + 1):
        url1 = f'http://www.hebei.gov.cn/eportal/ui?pageId=13871225&currentPage={page}'
        tt = requests.get(url1).content.decode('utf-8').replace(
            '\n', '').replace('\r', '').replace('\t', '')
        for content in re.findall(row_pattern, tt):
            linkurl = 'http://www.hebei.gov.cn' + content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            div = Html.xpath(
                '//*[@id="fadd83fc626241d9937b20353ca675eb"]/div[2]')[0]
            infocontent = html.unescape(
                etree.tostring(div, method='html').decode()).replace(
                    "'", " ").replace('"', ' ')
            title = content[1]
            publicTime = content[3]
            select = Mysql.select_xinwen(title=title)  # is the title already stored?
            # "not select" covers both a None result and an empty result set
            # (sibling crawlers compare the result with None).
            if not select:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='050000-075000',
                                             regionName='河北省',
                                             areaRegion='河北省',
                                             publicTime=publicTime,
                                             linkurl=linkurl,
                                             title=title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid,
                                               infocontent=infocontent)
            else:
                print('标题存在')
コード例 #17
0
ファイル: guo.py プロジェクト: liman21/xinwen
def guo():  # State Council news
    """Crawl the State Council news column and store new articles.

    Page 0 supplies the page count; pages ``0..pages-1`` are fetched,
    flattened and matched for ``(href, title, date)`` rows.  Each detail
    page is downloaded, ``/html/body/div[3]/div[2]/div[1]`` is serialised as
    the article body, and the article is stored unless
    ``Mysql.select_xinwen`` already knows the title.

    Raises whatever ``requests``/``lxml`` raise, and IndexError when the page
    counter or the content ``div`` is missing.
    """
    url = 'http://sousuo.gov.cn/column/19423/0.htm'
    tt = requests.get(url).content.decode('utf-8')
    # Raw string: "\d" is a regex escape, not a Python string escape.
    pages = re.findall(r'共(\d+)页', tt)[0]
    for page in range(int(pages)):
        url1 = f'http://sousuo.gov.cn/column/19423/{page}.htm'
        tt1 = requests.get(url1).content.decode('utf-8').replace(
            '\n', '').replace('\r', '').replace('\t', '')
        contents = re.findall(
            '<li><h4><a href="(.*?)" target="_blank">(.*?)</a><span class="date">(.*?)</span></h4></li>',
            tt1)
        for content in contents:
            linkurl = content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            div = Html.xpath('/html/body/div[3]/div[2]/div[1]')[0]
            infocontent = html.unescape(
                etree.tostring(div, method='html').decode()).replace(
                    "'", " ").replace('"', ' ')
            title = content[1]
            publicTime = content[2]
            select = Mysql.select_xinwen(title=title)  # is the title already stored?
            # "not select" covers both a None result and an empty result set.
            if not select:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='000000',
                                             regionName='国务院',
                                             areaRegion='全国',
                                             publicTime=publicTime,
                                             linkurl=linkurl,
                                             title=title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid,
                                               infocontent=infocontent)
            else:
                print('标题存在')
コード例 #18
0
def pingxiang():
    """Crawl three news columns of jxsggzy.cn and store new articles.

    For columns 007001..007003 the first page supplies the page count; every
    page is fetched, flattened and matched for ``(href, title, text, date)``
    rows.  Links containing ``pdf`` or ``doc`` are downloaded to disk and
    stored as an ``<embed>`` tag; other links are downloaded and their whole
    document stored.  An article is inserted unless ``Mysql.select_xinwen``
    already knows the title.

    On any exception the error is printed and the whole crawl starts over.
    This is done in a loop rather than by recursion, so repeated failures
    can no longer exhaust the call stack.
    """
    while True:
        try:
            for num in range(1, 4):
                url = f'http://www.jxsggzy.cn/web/xwzx/00700{num}/1.html'
                tt = requests.get(url).content.decode('utf-8')
                # Raw string: "\d" is a regex escape, not a string escape.
                pages = re.findall(r'id="index">1/(\d+)</span>', tt)[0]
                print(f'江西省公共交易中心共{pages}页')
                for page in range(1, int(pages) + 1):
                    url1 = url.replace('1.html', f'{page}.html')
                    tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                    contents = re.findall('<li class="ewb-list-node clearfix">                            <a href="(.*?)"  title="(.*?)" target="_blank" class="ewb-list-name">(.*?)</a>                            <span class="ewb-list-date">(.*?)</span> ', tt)
                    # NOTE(review): the first row (index 0) is skipped, as in
                    # the original -- confirm this is intended.
                    for content in contents[1:]:
                        title = content[1]
                        publicTime = content[3]
                        linkurl = 'http://www.jxsggzy.cn' + content[0]
                        if re.findall('pdf|doc', content[0]):
                            infocontent = '<embed src="' + linkurl + '" >'
                            # NOTE(review): the attachment is saved with a
                            # ".jpg" suffix although the link is pdf/doc.
                            urllib.request.urlretrieve(quote(linkurl, safe='/:?='), r'D:\lm\xinwen\江西省公共资源交易中心\\' + title + '.jpg')
                        else:
                            detail_res = requests.get(linkurl).content.decode('utf-8')
                            Html = etree.HTML(detail_res)
                            # The unused breadcrumb lookup was removed: it
                            # raised IndexError on pages without a breadcrumb
                            # and restarted the whole crawl.
                            infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace(
                                '"', ' ')
                        select = Mysql.select_xinwen(title=title)  # is the title already stored?
                        # "not select" covers both a None result and an empty
                        # result set.
                        if not select:
                            uid = uuid.uuid4()
                            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                                         publicTime=publicTime, linkurl=linkurl, title=title,
                                                         dataResource='', yewuType='江西省公共交易中心', infoType='', infoState='', isok='',
                                                         isdeal='')
                            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                            print(f'{num} 标题【{title}】写入成功')
                        else:
                            print(f'{num} 标题【{title}】存在')
                    print('-'*50+f'{num} 江西省公共交易中心第{page}页已写完'+'-'*50)
            return
        except Exception as e:
            print('萍乡\t', e)
コード例 #19
0
ファイル: temp10.py プロジェクト: Yuan-zewei/jzsc
 def gx_qyid(self):
     """Look up the remote company id for ``self.z`` and store it.

     Queries the jzsc company-list API through a proxy from ``self.ipz()``
     and decodes the reply with ``self.jd_nx``.  Outcomes:
       * server-busy text in the reply: printed, nothing else happens;
       * empty result list: the row is flagged with ``cx_state='3'``;
       * otherwise the ``QY_ID`` of the first hit is written via
         ``Mysql.update_qyid`` and returned.

     Returns:
         The company id on success, otherwise None.  Exceptions are caught:
         ones mentioning ``HTTPConnectionPool`` rotate the token/proxy pair,
         all others are printed.
     """
     try:
         print('开始更新企业id')
         qyurl = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?complexname={self.z}&pg=0&pgsz=15&total=0'
         resp1 = requests.get(url=qyurl,
                              headers=self.headers,
                              proxies=self.ipz(),
                              timeout=10)
         if '服务器繁忙,请稍后重试' in str(resp1.text):
             print('服务器繁忙,请稍后重试')
             return None
         asddd2 = self.jd_nx(data=f'{resp1.text}')
         if len(asddd2['data']['list']) == 0:
             print('没有这个公司异常')
             Mysql.gxqy_fupa(cx_state='3', eid=self.qyid)
             return None
         qyid = asddd2['data']['list'][0]['QY_ID']
         Mysql.update_qyid(qyurl=qyid, eid=self.qyid)  # store the refreshed company id
         print('企业更新完毕')
         return qyid
     except Exception as e:
         if "HTTPConnectionPool" in str(e):
             print('ip失效')
             Mysql.dele_token(token=self.jichu)
             print('token删除成功')
             Mysql.token(token=self.jichu)
             # Call jichutoken() once: the original called it three times, so
             # the token and the proxy address could come from different
             # results.
             fresh = self.jichutoken()
             self.jichu = fresh[0]
             self.ip = {
                 "http": "http://" + fresh[1],
                 "https": "https://" + fresh[1]
             }
         else:
             print('不存在')
             print(e, '基础信息错误')
コード例 #20
0
def _gcxm_text(value):
    """Return ``''`` when an API field is ``None``, otherwise the value unchanged."""
    return '' if value is None else value


def _gcxm_location(resp):
    """Build the location string from PROVINCE / CITY / COUNTY of ``resp``.

    A ``None`` PROVINCE or CITY is normalised to ``''`` inside ``resp``
    itself, in case a caller reads the dict afterwards. A record without a
    province yields an empty location.
    """
    province = resp['PROVINCE']
    if province is None:
        resp['PROVINCE'] = ''
        return ''
    city = resp['CITY']
    if city is None:
        resp['CITY'] = ''
        return province
    county = resp['COUNTY']
    if county is None:
        return province + '-' + city
    # NOTE(review): county is placed before city here; kept as-is because
    # stored rows already use this order -- confirm it is intended.
    return province + '-' + county + '-' + city


def gcxm(resp, qyid, i):
    """Store the summary fields of one engineering project for a company.

    Reads the project fields from ``resp``, replaces ``None`` values with
    empty strings, then updates the existing ``tbl_qy_xm`` row when
    ``Mysql.selecttbl_qy_xm`` finds one for (``qyid``, project number) and
    inserts a new row otherwise.

    Args:
        resp: mapping holding the project's API fields (``PRJNUM``,
            ``PRJNAME``, ``PROVINCE`` ...); a ``None`` PROVINCE or CITY is
            replaced by ``''`` in place.
        qyid: company id the project belongs to.
        i: ordinal of the project, used only in the progress message.

    Returns:
        None. Any exception (missing key, database error) is printed and
        swallowed so one bad project does not stop the caller's loop.
    """
    print('--该工程项目的部分信息--')
    try:
        xmid = _gcxm_text(resp['PRJNUM'])  # project number
        sjxmbh = _gcxm_text(resp['PROVINCEPRJNUM'])  # provincial project number
        xmmc = _gcxm_text(resp['PRJNAME'])  # project name

        gsd = _gcxm_location(resp)

        xmlb = _gcxm_text(resp['PRJTYPENUM'])  # project category
        jsdw_bh = ''  # builder (construction unit) number: always stored empty
        jsdw = _gcxm_text(resp['BUILDCORPNAME'])  # builder name
        jsdw_xydm = _gcxm_text(resp['BUILDCORPCODE'])  # builder credit code

        jsxz = _gcxm_text(resp['PRJPROPERTYNUM'])  # construction nature
        gzyt = _gcxm_text(resp['PRJFUNCTIONNUM'])  # project purpose

        ztz = resp['ALLINVEST']  # total investment
        ztz = '' if ztz is None else str(ztz) + '(万元)'
        zmj = resp['ALLAREA']  # total area
        zmj = '' if zmj is None else str(zmj) + '(平方米)'

        lxjb = _gcxm_text(resp['PRJAPPROVALLEVELNUM'])  # approval level
        lxwh = _gcxm_text(resp['PRJAPPROVALNUM'])  # approval document number

        # Update and insert take exactly the same columns.
        row = dict(qyid=qyid,
                   xmid=xmid,
                   sjxmbh=sjxmbh,
                   xmmc=xmmc,
                   gsd=gsd,
                   xmlb=xmlb,
                   jsdw_bh=jsdw_bh,
                   jsdw=jsdw,
                   jsdw_xydm=jsdw_xydm,
                   szqh=gsd,  # administrative division mirrors the location
                   jsxz=jsxz,
                   gzyt=gzyt,
                   ztz=ztz,
                   zmj=zmj,
                   lxjb=lxjb,
                   lxwh=lxwh)
        if Mysql.selecttbl_qy_xm(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm(**row)
        else:
            Mysql.inserttbl_qy_xm(**row)
        print(f'         第{i}个项目{xmmc}的部分信息插入完成')
    except Exception as e:
        print(e)
コード例 #21
0
ファイル: hq_token.py プロジェクト: Yuan-zewei/jzsc
def selenu(url, qyname, ip):
    """Open a company detail page in headless Chrome and harvest its access token.

    The browser is routed through the proxy ``ip`` and loads
    ``http://jzsc.mohurd.gov.cn/data/company/detail?id={url}``. The page
    source is then inspected in a loop:

    * If it contains '重新验证' but not ``qyname``, a header button is
      clicked, the captcha widget is screenshotted and sent to
      ``Chaojiying_Client.PostPic`` (type 9103), and the returned
      coordinates are clicked. When ``qyname`` then appears, the access
      token is extracted from the Chrome performance log and stored with
      ``Mysql.insert_token`` unless ``Mysql.seletoken`` already finds it.
    * If it already contains ``qyname``, the token is extracted and stored
      the same way.
    * Otherwise the page is refreshed and the loop ends.

    After storing a token the function polls ``Mysql.jichutoken(yxq='0')``
    and only refreshes the page once that call returns a falsy value.

    Args:
        url: company id appended to the detail-page URL.
        qyname: company name whose presence in the page source is taken as
            "page loaded / captcha passed".
        ip: proxy address, used as ``http://{ip}`` for the browser and
            stored alongside the token.

    Returns:
        None. Any exception is printed, the driver is quit and the loop
        ends.
    """
    print(f'开始尝试')
    # NOTE(review): this first `caps` value is discarded by the reassignment
    # just below; the only lasting effect is that the
    # DesiredCapabilities.CHROME dict has been modified in place.
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}
    # Capabilities actually used: performance logging with network events
    # enabled, so request details (including the token) show up in the log.
    caps = {
        'browserName': 'chrome',
        'loggingPrefs': {
            'browser': 'ALL',
            'driver': 'ALL',
            'performance': 'ALL',
        },
        'goog:chromeOptions': {
            'perfLoggingPrefs': {
                'enableNetwork': True,
            },
            'w3c': False,
        },
    }
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_experimental_option('w3c', False)
    chromeOptions.add_experimental_option(
        'excludeSwitches',
        ['enable-automation'])  # drop the automation switch so the page is less likely to detect Selenium
    chromeOptions.add_argument('--headless')  # run without a visible browser window
    chromeOptions.add_argument(f'--proxy-server=http://{ip}')  # route browser traffic through the given proxy
    driver = webdriver.Chrome(options=chromeOptions, desired_capabilities=caps)
    driver.maximize_window()
    driver.set_page_load_timeout(40)  # a page load slower than this raises
    driver.get(f'http://jzsc.mohurd.gov.cn/data/company/detail?id={url}')
    # `a` only feeds the refresh message below; it is set to 0 after a new
    # token has been inserted.
    a = 1
    # NOTE(review): the driver is quit only in the `except` branch; the
    # `break` exits further down leave the browser running -- confirm whether
    # that is intended.
    while True:
        try:
            time.sleep(3)
            he1 = driver.page_source
            time.sleep(1)
            # Captcha prompt shown and the company name not rendered yet.
            if he1.find('重新验证') != -1 and he1.find(f'{qyname}') == -1:
                # driver.switch_to.window(driver.window_handles[0])  # switching windows turned out not to help
                time.sleep(3)
                tijiao = driver.find_element_by_xpath(
                    '//*[@id="app"]/div/header/div[5]/div/div[3]/div/button[1]/span'
                )
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", tijiao)
                time.sleep(1)
                # driver.switch_to.window(driver.window_handles[0])  # switch window
                hem = driver.page_source
                time.sleep(1)
                # Up to six captcha attempts.
                # NOTE(review): `hem` is captured once before the loop and is
                # not refreshed inside it, so this condition evaluates the
                # same way on every iteration.
                for ui in range(0, 6):
                    if hem.find('请完成安全验证') != -1 or hem.find(
                            f'{qyname}') == -1:
                        # Timestamps used to build the screenshot file name.
                        current_time = time.strftime(
                            "%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
                        current_time1 = time.strftime(
                            "%Y-%m-%d", time.localtime(time.time()))
                        time.sleep(0.5)
                        imgelement = driver.find_element_by_xpath(
                            '/html/body/div[2]/div[2]/div/div/div[2]')
                        locations = imgelement.location
                        sizes = imgelement.size
                        # Crop box: the element's rectangle shrunk by 20 px on every side.
                        rangle = (int(locations['x'] + 20),
                                  int(locations['y'] + 20),
                                  int(locations['x'] + sizes['width'] - 20),
                                  int(locations['y'] + sizes['height'] - 20))
                        pfilename = '.\\image'  # use an absolute path if this relative one does not resolve
                        save_path = pfilename + '\\' + current_time1 + '_' + current_time + '.png'
                        time.sleep(1.5)
                        driver.save_screenshot(save_path)
                        img = Image.open(save_path)
                        # NOTE(review): the RGB conversion is overwritten by
                        # the crop of the unconverted image on the next line.
                        jpg = img.convert('RGB')
                        jpg = img.crop(rangle)
                        # Same path as save_path: the cropped image replaces the full screenshot.
                        path = pfilename + '\\' + current_time1 + '_' + current_time + '.png'
                        time.sleep(1)
                        jpg.save(path)
                        print("图片截取成功!")
                        # The three arguments are placeholders for account, password and software id.
                        chaojiying = Chaojiying_Client('账号', '密码',
                                                       '软件id')  # software id from the user centre
                        # NOTE(review): the file handle opened here is never closed explicitly.
                        im = open(path, 'rb').read()
                        zuo = chaojiying.PostPic(im, 9103)
                        # 'pic_str' is parsed as "x,y|x,y|..." into integer pairs.
                        groups = zuo.get('pic_str').split('|')
                        locations_chaojiying = [[
                            int(number) for number in group.split(',')
                        ] for group in groups]
                        if len(locations_chaojiying) > 0:
                            element = WebDriverWait(driver, 5, 0.5).until(
                                EC.presence_of_element_located(
                                    (By.CLASS_NAME, 'yidun_bg-img')))
                            # NOTE(review): this action chain is built but never performed.
                            ActionChains(driver).move_to_element(element)
                            time.sleep(0.5)
                            location_x = 0
                            location_y = 0
                            pyautogui.moveTo(locations['x'] + 25,
                                             int(locations['y'] + 96),
                                             duration=0.3)  # moves the real mouse pointer; works, but is only there to watch the run
                            for location in locations_chaojiying:
                                # Move the pointer by the offset from the previous point.
                                pyautogui.moveRel(location[0] - location_x,
                                                  location[1] - location_y,
                                                  duration=0.6)
                                driver.execute(Command.MOVE_TO, {
                                    'xoffset': location[0],
                                    'yoffset': location[1]
                                })
                                print(" 点击坐标 " + str(location[0]),
                                      str(location[1]))
                                # The actual click, offset from the captcha image element.
                                ActionChains(
                                    driver).move_to_element_with_offset(
                                        element, location[0],
                                        location[1] + 2).click().perform()
                                # Random pause between clicks.
                                time.sleep(
                                    random.randint(1, 3) + random.random())
                                location_x = location[0]
                                location_y = location[1]
                        time.sleep(10)  # give a slow page time to render the company name
                        print('移动成功')
                        hem12 = driver.page_source
                        if hem12.find(f'{qyname}') != -1:
                            print('跳过验证码')
                            # Pull the access token out of the performance log entries.
                            logs = [
                                json.loads(log['message'])['message']
                                for log in driver.get_log('performance')
                            ]
                            token = re.findall(
                                "accessToken': '(.*?)==', 'timeout': '30000'",
                                str(logs))[-1] + '=='
                            a21 = Mysql.seletoken(token=token)
                            if a21:
                                print('token已存在跳过')
                            else:
                                Mysql.insert_token(token=token, ip=ip)
                                a = 0
                            # Poll until Mysql.jichutoken(yxq='0') returns a
                            # falsy value, then refresh the page.
                            while True:
                                a12 = Mysql.jichutoken(yxq='0')
                                if a12:
                                    print('token获得成功暂停5秒钟', token)
                                    time.sleep(5)
                                else:
                                    driver.refresh()
                                    break  # leaves only this polling loop; the attempt loop continues
                        else:
                            print('验证失败或者没有这个公司重新尝试')
                            time.sleep(3)
                            break  # leaves the attempt loop; the outer loop re-reads the page
                            # driver.refresh()
            # Company name already visible: no captcha needed, read the token directly.
            elif he1.find(f'{qyname}') != -1:
                logs = [
                    json.loads(log['message'])['message']
                    for log in driver.get_log('performance')
                ]
                token = re.findall(
                    "accessToken': '(.*?)==', 'timeout': '30000'",
                    str(logs))[-1] + '=='
                a21 = Mysql.seletoken(token=token)
                if a21:
                    print('token已存在跳过')
                else:
                    Mysql.insert_token(token=token, ip=ip)
                    a = 0
                # Same polling as above, with a 2 s pause before each check.
                while True:
                    time.sleep(2)
                    a12 = Mysql.jichutoken(yxq='0')
                    if a12:
                        print('token获得成功暂停5秒钟', token)
                        time.sleep(5)
                    else:
                        driver.refresh()
                        break
            # Neither a captcha prompt nor the company name: refresh and stop.
            elif he1.find(f'{qyname}') == -1:
                print(f'2第{a}次刷新')
                driver.refresh()
                break
            # NOTE(review): the two `elif` conditions above are complementary,
            # so this branch looks unreachable.
            else:
                time.sleep(1.5)
                driver.refresh()
                break
        except Exception as e:
            print(e)
            driver.quit()
            break
コード例 #22
0
ファイル: temp14.py プロジェクト: Yuan-zewei/jzsc
 def qyjcxx(self):
     """Search the platform for the next pending company name and queue its matches.

     Takes one pending row from ``Mysql.qiyexx()`` (``row[0]`` is the record
     id, ``row[1]`` the company name), queries the jzsc.mohurd.gov.cn
     company-list API for that name through ``self.ip``, and for every
     company returned registers it in the re-crawl table unless
     ``Mysql.selecttbl_qyname`` already knows ``self.qyid``.

     The proxy is rotated with ``self.ipz()`` when the HTTP status is not
     200, when the decoded payload's ``code`` is not 200, or when any
     exception is raised.

     Returns:
         None.
     """
     try:
         a = Mysql.qiyexx()
         if a is None:
             print('当前没有数据可以爬取')
             return
         print(a)
         self.z = a[1]  # company name
         self.qyid = a[0]
         url = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?complexname={self.z}&pg=0&pgsz=15&total=0'
         print(f'正在进行========================{self.z}========================================关键字信息的爬取')
         print('--------------------------------')
         resp = requests.get(url=url, headers=self.headers, proxies=self.ip, timeout=15)
         # Both the success and the failure path decode the body first.
         assss = json.loads(self.jd_nx(data=f'{resp.text}'))

         if resp.status_code != 200:
             print('请求失败', assss)
             self.ip = self.ipz()
             return
         if assss['code'] != 200:
             print('你的ip被封')
             self.ip = self.ipz()
             print('ip切换成功')
             return

         qy_xinxi1 = assss['data']['list']
         if len(qy_xinxi1) == 0:
             print(f'没有{self.z}这个公司')
             Mysql.gxqy_fupa_te(zt='0', gsname=self.z)
             return

         Mysql.insert_qy_list(eid=self.qyid, qyname=self.z, bh='1', qy_zt='1')  # record that the name resolved to real companies
         print('放入数据成功')
         for qy_xinxi in qy_xinxi1:
             print(qy_xinxi, '-=-=-=-=-=-=-----------------------')
             qyid = qy_xinxi['QY_ID']
             QY_ORG_CODE = qy_xinxi['QY_ORG_CODE']
             QY_NAME = qy_xinxi['QY_NAME']
             print(qyid, QY_ORG_CODE, QY_NAME)
             dwid = Mysql.qiyexx_eid(qyname=QY_NAME)
             print(dwid)
             # Reuse the known id for this company name, or mint a new one.
             if dwid is None:
                 dwid = uuid4()
             else:
                 dwid = dwid[0]
             # Use self.z instead of indexing the pending row, so every
             # iteration (not just the first) marks the right company name.
             Mysql.gxqy_fupa_te(zt='1', gsname=self.z)
             existing = Mysql.selecttbl_qyname(eid=self.qyid)
             if existing is None:
                 print('正在插入tbl_fupa_temp表')
                 Mysql.insetqyzt(eid=dwid, type='0', cx_val=QY_NAME, cx_state='1', qiyeurl=qyid, qyzzzt='1',
                                 ryzt='1',
                                 ryzyzc_zt='1', bh='1', qy_jcxx_zt='1')
                 # dwid is already the bare id here; indexing it again would
                 # fail on a UUID or pick a single character of a string.
                 # NOTE(review): confirm update_qname_list should be keyed by
                 # dwid rather than self.qyid.
                 Mysql.update_qname_list(qy_zt=1, eid=dwid)
             else:
                 print('数据库已经存在该公司!!')
                 Mysql.update_qname_list(qy_zt=2, eid=dwid)
     except Exception as e:
         print(e)
         self.ip = self.ipz()
コード例 #23
0
def tulufan(name):
    """Crawl the Turpan (tlf.gov.cn) news lists and hand unseen articles to ``chuli``.

    Starts a headless Chrome, walks each configured list URL for up to its
    configured number of pages and, for every row whose title is not yet
    known to ``Mysql.select_xw_nr1`` and whose publication date is not
    older than the module-level ``jiezhi_time``, calls ``chuli`` to process
    the article. A column is abandoned as soon as an unseen article older
    than the cutoff is met.

    Args:
        name: city name; used for the duplicate lookup and passed to
            ``chuli``.

    Returns:
        None. On any exception the error is printed, the browser is shut
        down and the whole crawl is restarted by calling this function
        again.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the Windows path contains backslashes that are not escapes.
        driver = webdriver.Chrome(
            options=chromeOptions,
            executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # List URL -> number of pages to walk.
        urls = {
            'http://www.tlf.gov.cn/ztlm/tlfxw.htm': 35,  # city government: Turpan news
            'http://www.tlf.gov.cn/ztlm/gsggtz.htm': 19,  # city government: notices and announcements
            'http://www.tlf.gov.cn/ztlm/xsdt.htm': 16,  # city government: county/district updates
            'http://www.tlf.gov.cn/ztlm/bmdt.htm': 12,  # city government: department updates
            'http://www.tlf.gov.cn/ztlm/jnwxw.htm': 21,  # city government: regional and outside news
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)

            xpath = "//table[@class='winstyle11251']/tbody/tr"
            length = len(html_2.xpath(xpath))  # row count of the first page
            po = 0  # becomes > 0 once an article older than the cutoff is met
            for page in range(1, pages + 1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                # NOTE(review): the loop stops at length - 1 while `lengt` is
                # the current page's full row count, so `i == lengt` (the
                # only place the next page is requested) can hold only on a
                # page shorter than the first -- confirm paging is reached.
                for i in range(1, length):
                    if 'www' in url and i % 5 == 0:
                        continue  # every fifth row is skipped
                    lengt = len(html_1.xpath(xpath))
                    xpath1 = xpath + f'[{i}]'

                    href = html_1.xpath(f"{xpath1}/td[2]/a/@href")[0].strip()
                    title = html_1.xpath(f"{xpath1}/td[2]/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                        '\r', '')
                    publictime = html_1.xpath(f"{xpath1}/td[3]/span/text()")[0].strip().replace('/', '-')

                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title already stored?

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last row of the page: go to the next page unless
                        # the page was short or this is the final page.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                        break
        # NOTE(review): the browser is left running after a successful crawl;
        # confirm whether the global `driver` is reused elsewhere before
        # adding a quit here.
    except Exception as e:
        print('吐鲁番\t', e)
        # The browser may never have started or may already be dead; a failing
        # cleanup must not prevent the retry. quit() also ends the
        # chromedriver session instead of only closing the window.
        try:
            driver.quit()
        except Exception:
            pass
        return tulufan(name)
コード例 #24
0
def wulumuqui(name):
    """Crawl the Urumqi news lists and hand unseen articles to ``chuli``.

    Starts a headless Chrome, walks each configured list URL for up to its
    configured number of pages and, for every entry whose title is not yet
    known to ``Mysql.select_xw_nr1`` and whose publication date is not
    older than the module-level ``jiezhi_time``, calls ``chuli`` to process
    the article. A column is abandoned as soon as an unseen article older
    than the cutoff is met.

    Args:
        name: city name; used for the duplicate lookup and passed to
            ``chuli``.

    Returns:
        None. On any exception the error is printed, the browser is shut
        down and the whole crawl is restarted by calling this function
        again.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the Windows path contains backslashes that are not escapes.
        driver = webdriver.Chrome(
            options=chromeOptions,
            executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # List URL -> number of pages to walk.
        urls = {
            'http://zwfw.xinjiang.gov.cn/xinjiangggzy/zwgk/002004/tradingCommon.html': 2,  # public resources centre: notices
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10005': 86,  # city government: Urumqi headlines
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12034': 59,  # city government: autonomous-region headlines
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12115': 61,  # city government: notices and announcements
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10006': 2,  # city government: policy interpretation
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)
            # The two sites use different list markup and a different first
            # usable item index.
            if 'zwfw' in url:
                xpath = "//div[@class='ewb-colu-bd']/div/ul/li/div"
                length = len(html_2.xpath(xpath)) + 2
                ii = 2
            else:
                xpath = "//ul[@class='commonList_dot am-padding-top-sm am-padding-bottom-0 commonList_dot_Listnews']/li"
                length = len(html_2.xpath(xpath)) + 1
                ii = 1
            po = 0  # becomes > 0 once an article older than the cutoff is met
            for page in range(1, pages + 1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                for i in range(ii, length):
                    if 'www' in url and i % 6 == 0:
                        continue  # every sixth entry is skipped on the www site

                    lengt = len(html_1.xpath(xpath))
                    # Address the i-th <li>; only one of the two replacements
                    # matches, depending on which list markup is in use.
                    xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]').replace(']/li', f']/li[{i}]')
                    if 'zwfw' in url:
                        href = html_1.xpath(f"{xpath1}/div/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/div/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        publictime = html_1.xpath(xpath1 + "/span/text()")[0].strip().replace('/', '-').replace('年', '-').replace('月', '-').replace('日', '')
                    else:
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')

                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title already stored?

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last entry of the page: go to the next page unless
                        # the page was short or this is the final page.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                        break
        # NOTE(review): the browser is left running after a successful crawl;
        # confirm whether the global `driver` is reused elsewhere before
        # adding a quit here.
    except Exception as e:
        print('乌鲁木齐\t', e)
        # The browser may never have started or may already be dead; a failing
        # cleanup must not prevent the retry. quit() also ends the
        # chromedriver session instead of only closing the window.
        try:
            driver.quit()
        except Exception:
            pass
        return wulumuqui(name)
コード例 #25
0
def ganzhou():
    """Crawl the Ganzhou government news lists and store headlines not yet in the database.

    For each enabled column, list pages 1..36 are fetched with ``requests``
    and parsed with regular expressions. Every entry dated on or after the
    cutoff (epoch 1570896000, i.e. 2019-10-13 00:00 at UTC+8) whose title
    ``Mysql.select_xinwen`` does not already return is written with
    ``Mysql.insert_xinwen_baseinfo`` plus an empty detail record; article
    bodies are not downloaded. The first entry older than the cutoff ends
    the processing of that page.

    A page without the expected list container ends the paging of the
    current column.

    Returns:
        None. On any exception the error is printed and the whole crawl is
        restarted by calling this function again.
    """
    try:
        url1s = [
            'http://www.ganzhou.gov.cn/c100022/list.shtml',  # government affairs updates
            'http://www.ganzhou.gov.cn/c100023/list.shtml',  # notices and announcements
            # Disabled columns:
            # 'http://www.ganzhou.gov.cn/c100024/list_bmqx.shtml',  # department updates
            # 'http://www.ganzhou.gov.cn/c100025/list_bmqx.shtml',  # county/district updates
            # 'http://www.ganzhou.gov.cn/c100026/list.shtml',  # public service tips
            # 'http://www.ganzhou.gov.cn/c100027/list.shtml',  # central government site picks
            # 'http://www.ganzhou.gov.cn/c100028/list.shtml',  # provincial government site picks
            # 'http://www.ganzhou.gov.cn/c100029/list.shtml',  # media from outside the city
            # 'http://www.ganzhou.gov.cn/c100030/list.shtml',  # press conferences
            # 'http://www.ganzhou.gov.cn/c100032/list.shtml',  # special topics
        ]
        for url1 in url1s:
            print("程序已启动,稍等几秒")

            for page in range(1, 37):
                if page == 1:
                    # NOTE(review): only the first page is fetched through the
                    # ipmax() proxies; confirm whether later pages should be too.
                    raw = requests.get(url1, proxies=ipmax()).content
                else:
                    url2 = url1.replace('list.shtml', f'list_{page}.shtml').replace('bmqx.shtml', f'bmqx_{page}.shtml')
                    raw = requests.get(url2).content
                # Flatten the markup so the non-greedy patterns can span lines.
                tt = raw.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')

                contents1 = re.findall('<div class="bd">(.*?)text/javascript', tt)
                if not contents1:
                    # No list container (e.g. past the last page): stop paging
                    # this column instead of failing and restarting the crawl.
                    print(f'赣州市第{page}页没有列表内容,停止翻页')
                    break
                contents = re.findall('<li><a href="(.*?)" target="_blank" title=\'(.*?)\'  >(.*?)</a><span>(.*?)</span>', contents1[0])
                for content in contents:
                    # WeChat article links are already absolute; site links are relative.
                    if re.search('mp.weixin', content[0]):
                        linkurl = content[0]
                    else:
                        linkurl = 'http://www.ganzhou.gov.cn' + content[0]
                    # NOTE(review): as written this replace is a no-op; it
                    # probably once normalised a full-width colon -- confirm.
                    title = content[1].replace(':', ':')
                    publicTime = content[3]
                    s = publicTime.replace('/', '-')
                    t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
                    if t < 1570896000:
                        break  # older than the cutoff: skip the rest of this page

                    select = Mysql.select_xinwen(title=title)  # is the title already stored?
                    if len(select) == 0:
                        uid = uuid.uuid4()
                        Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='赣州市',
                                                    publicTime=publicTime, linkurl=linkurl, title=title,
                                                    dataResource='', yewuType='人民政府', infoType='', infoState='', isok='',
                                                    isdeal='')
                        # The article body is not fetched; the detail row is stored empty.
                        Mysql.insert_xinwen_detailinfo(uid=uid, infocontent='')
                        print(f'标题【{title}】写入成功')
                    else:
                        print(f'标题【{title}】存在')
                print('-' * 50 + f'赣州市第{page}页已完成' + '-' * 50)
    except Exception as e:
        # Label the failure with this crawler's own city.
        print('赣州\t', e)
        return ganzhou()
コード例 #26
0
def gcxm_jcxx(resp, qyid):
    """Print and persist the basic information of one engineering project.

    Every field is read from ``resp`` with ``None`` replaced by ``''``;
    ``BEGINDATE`` and ``ENDDATE`` are additionally passed through
    ``time_s``.  The values are printed and then written with
    ``Mysql.updatetbl_qy_xm_jcxx`` when ``Mysql.selecttbl_qy_xm_jcxx``
    returns a truthy value for ``(qyid, xmid)``, otherwise with
    ``Mysql.inserttbl_qy_xm_jcxx``.

    Args:
        resp: Mapping holding the project record; all keys read below must
            be present, otherwise the resulting ``KeyError`` is caught and
            printed and nothing is written.
        qyid: Company identifier forwarded to the ``Mysql`` helpers.

    Returns:
        None.  Any exception raised while reading or storing the record is
        printed instead of propagated (best-effort behaviour kept from the
        original implementation).
    """

    def _text(key):
        # Missing values are stored as '' rather than None.
        value = resp[key]
        return '' if value is None else value

    def _date(key):
        # Same as _text, but real values go through the time_s helper.
        value = resp[key]
        return '' if value is None else time_s(value)

    print('--该工程项目的基础信息--')
    try:
        xmid = resp['PRJNUM']

        print(resp['ADDRESS'])  # raw address, printed before normalisation
        address = _text('ADDRESS')  # exact location
        zjbl = _text('NATIONALPERCENTTAGE')  # state-owned funding ratio
        zjly = _text('FUNDSOURCE')  # funding source
        jsydxkzbh = _text('BUILDPLANNUM')  # construction land planning permit no.
        jscgghxkzbh = _text('PROJECTPLANNUM')  # construction project planning permit no.
        jhkg = _date('BEGINDATE')  # planned start
        jhjg = _date('ENDDATE')  # planned completion
        jsgm = _text('PRJSIZE')  # construction scale
        sjly = _text('DATASOURCE')  # data source
        sjdj = _text('DATALEVEL')  # data level

        # Printed under the label 重点项目 although it is derived from IS_FAKE.
        # NOTE(review): confirm that this mapping is intended.
        is_fake = resp['IS_FAKE']
        if is_fake is None:
            zdxm = ''
        elif is_fake == 0:
            zdxm = '否'
        else:
            zdxm = '是'

        lxpfsj = _text('PRJAPPROVALDATE')  # project approval time
        lxpfjg = _text('PRJAPPROVALDEPART')  # project approval authority

        # Each label is now paired with its own value; previously the labels
        # were shifted by one from jhjg onwards and sjly was printed twice.
        print('\t具体地点:', address, '\t国有资金出资比例:', zjbl, '\t资金来源:', zjly,
              '\t建设用地规划许可证编号:', jsydxkzbh, '\t建设工程规划许可证编号:', jscgghxkzbh,
              '\t计划开工:', jhkg, '\t计划竣工:', jhjg, '\t建设规模:', jsgm,
              '\t数据来源:', sjly, '\t数据等级:', sjdj, '\t重点项目:', zdxm,
              '\t立项批复时间:', lxpfsj, '\t立项批复机关:', lxpfjg)

        # Update and insert take exactly the same keyword arguments.
        fields = dict(qyid=qyid,
                      xmid=xmid,
                      address=address,
                      zjbl=zjbl,
                      zjly=zjly,
                      jsydxkzbh=jsydxkzbh,
                      jscgghxkzbh=jscgghxkzbh,
                      jhkg=jhkg,
                      jhjg=jhjg,
                      jsgm=jsgm,
                      sjly=sjly,
                      sjdj=sjdj,
                      zdxm=zdxm,
                      lxpfsj=lxpfsj,
                      lxpfjg=lxpfjg)
        if Mysql.selecttbl_qy_xm_jcxx(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm_jcxx(**fields)
        else:
            Mysql.inserttbl_qy_xm_jcxx(**fields)
        print('         该项目基础信息插入完成')
    except Exception as e:
        print(e)
コード例 #27
0
def gcxm_weizhi(resp, qyid):
    """Print and persist the not-yet-understood fields of one project record.

    Every field listed in ``columns`` is read from ``resp`` with ``None``
    replaced by ``''``; ``INVPROPERTY`` and ``WANDAOLEE_ROWGUID`` are
    additionally passed through ``time_s``.  The values are printed on one
    line and then written with ``Mysql.updatetbl_qy_xm_weizhi`` when
    ``Mysql.selecttbl_qy_xm_weizhi`` returns a truthy value for
    ``(qyid, xmid)``, otherwise with ``Mysql.inserttbl_qy_xm_weizhi``.

    Args:
        resp: Mapping holding the project record.  ``resp['ID']`` is read
            outside the ``try`` block, so a missing ``ID`` (or a non-string
            one) raises to the caller; problems with any other key are
            caught and printed.
        qyid: Company identifier forwarded to the ``Mysql`` helpers.

    Returns:
        None.
    """
    # (Mysql keyword argument, key in resp, convert real values with time_s?)
    # The keyword spellings (wandaolee_roeguid, cxxninfo, checkdepariname) are
    # the ones the Mysql helpers are called with and must not be "corrected"
    # here.  CHECKDEPARTNAME is deliberately listed twice because it feeds two
    # different keyword arguments.
    columns = (
        ('locationx', 'LOCATIONX', False),
        ('locationy', 'LOCATIONY', False),
        ('alllength', 'ALLLENGTH', False),
        ('ismajor', 'ISMAJOR', False),
        ('jzjninfo', 'JZJNINFO', False),
        ('invpropertynum', 'INVPROPERTYNUM', False),
        ('invproperty', 'INVPROPERTY', True),
        ('wandaolee_roeguid', 'WANDAOLEE_ROWGUID', True),
        ('sortnum', 'SORTNUM', False),
        ('prefix', 'PREFIX', False),
        ('jsbjgsign', 'JSBJGSIGN', False),
        ('pkid', 'PKID', False),
        ('cxxninfo', 'CXXMINFO', False),
        ('checkdepariname', 'CHECKDEPARTNAME', False),
        ('prjtwodimcode', 'PRJTWODIMCODE', False),
        ('prjapprovaldepart', 'PRJAPPROVALDEPART', False),
        ('checkdepartname', 'CHECKDEPARTNAME', False),
        ('prjapprovaldate', 'PRJAPPROVALDATE', False),
        ('mark', 'MARK', False),
        ('fake_corp_name', 'FAKE_CORP_NAME', False),
        ('fake_corp_id', 'FAKE_CORP_ID', False),
    )

    ID = resp['ID']
    url = 'http://jzsc.mohurd.gov.cn/data/project/detail?id=' + ID
    print(f'--工程项目基础信息中未知的字段--\n其相关链接:{url}')
    try:
        xmid = resp['PRJNUM']
        prjcode = resp['PRJCODE']
        if prjcode is None:
            prjcode = ''

        fields = {}
        shown = []
        for kwarg, key, is_timestamp in columns:
            value = resp[key]
            if value is None:
                value = ''
            elif is_timestamp:
                value = time_s(value)
            fields[kwarg] = value
            shown.append(f'{key}:{value}')
        # FAKE_CORP_ID used to be printed under the label "prjcode" and the
        # real PRJCODE value was never shown; both now appear correctly.
        shown.append(f'prjcode:{prjcode}')
        print(','.join(shown))

        fields['prjcode'] = prjcode
        # Update and insert take exactly the same keyword arguments.
        if Mysql.selecttbl_qy_xm_weizhi(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm_weizhi(qyid=qyid, xmid=xmid, **fields)
        else:
            Mysql.inserttbl_qy_xm_weizhi(qyid=qyid, xmid=xmid, **fields)
        print('         该项目未知信息插入完成')

    except Exception as err:
        print(err)
コード例 #28
0
def aletai(name):
    """Crawl the news list pages of www.alt.gov.cn with headless Chrome.

    For every list URL in ``urls`` (mapped to the number of pages to walk)
    the list items are read from the page source; for each item the link,
    title and publish date are extracted.  Items whose title is not yet
    known to ``Mysql.select_xw_nr1`` and whose publish date is not older
    than ``jiezhi_time`` are either inserted directly (URLs containing
    ``'jxcq'``) or handed to ``chuli``.  The first item older than
    ``jiezhi_time`` stops the crawl of the current list URL.

    ``jiezhi_time``, ``pro`` and ``chuli`` are not defined in this block and
    must exist at module level.

    Args:
        name: City name; printed, stored as ``dijishi`` and used for the
            duplicate lookup.

    Returns:
        None on normal completion.  On any exception the error is printed,
        the browser window is closed and the function calls itself again
        with the same ``name``.

    NOTE(review): the retry is unbounded recursion; a persistent failure
    will recurse until the interpreter's recursion limit is hit.  The driver
    is also only closed on the error path in the visible code.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the matching Excel template
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # run without a visible browser window
        driver = webdriver.Chrome(options=chromeOptions,executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list-page URL -> number of pages to walk
        urls = {
            'http://www.alt.gov.cn/zwxx/001003/listPage.html': 436,  # People's Government: autonomous-region news
            'http://www.alt.gov.cn/zwxx/001001/listPage.html': 47,  # People's Government: government affairs updates
            'http://www.alt.gov.cn/zwxx/001004/listPage.html': 20,  # People's Government: township/farm updates
            'http://www.alt.gov.cn/zwxx/001005/listPage.html': 32,  # People's Government: department updates
            'http://www.alt.gov.cn/zwxx/001006/listPage.html': 5,  # People's Government: public notices

            }
        for url, pages in zip(urls.keys(), urls.values()):
            driver.get(url)
            con = driver.page_source
            html_2 = etree.HTML(con)

            xpath = "//div[@class='ewb-pl20']/ul/li"
            # Item count of the first page; used as the loop bound for every page.
            length = len(html_2.xpath(xpath))
            po = 0  # becomes > 0 once an item older than jiezhi_time is seen
            for page in range(1, pages+1):
                # Re-read the page source: the browser may have moved to the next page.
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                # NOTE(review): range(1, length) stops at length - 1, so i can only
                # equal lengt (below) when the current page has fewer items than the
                # first page did - confirm this matches the site's layout.
                for i in range(1, length):
                    # if 'www' in url and i%5==0:
                    #     pass
                    # else:
                        lengt = len(html_1.xpath(xpath))  # item count of the current page
                        xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]')
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')

                        select = Mysql.select_xw_nr1(biaoti=title,dijishi=name)  # check whether the title already exists

                        if select == None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            # jiezhi_time = int(time.mktime(time.strptime('2018-01-01', "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                if 'jxcq' in url:
                                    # None of the URLs in `urls` above contains 'jxcq'.
                                    insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                                    link = 'http://www.jxcq.org' + href
                                    uid = uuid.uuid4()
                                    Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime,
                                                       url=link,
                                                       biaoti=title, tianjiatime=insertDBtime, zt='0')
                                    print(f'--{city}-【{title}】写入成功')
                                else:
                                    chuli(publictime, href, driver, url, title, city,xpath1)
                            else:
                                # Item is older than the cut-off: stop this list URL.
                                po += 1
                                break
                        if i == lengt:
                                if lengt < length - 1:
                                    break
                                else:
                                    if page != pages:
                                        # Go to the next page, trying three different "next" controls in turn.
                                        try:
                                            driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                                        except:
                                            try:
                                                driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                            except:
                                                driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                                break
    except Exception as e:
        print('阿勒泰\t', e)
        driver.close()
        # Restart the whole crawl from scratch.
        return aletai(name)
コード例 #29
0
def hubei(name):
    """Crawl the company query table at 220.160.52.164:96 with headless Chrome.

    For every entry of ``urls`` (dropdown option value mapped to
    ``'<label>|<page count>'``) the option is selected in the
    ``ctl00_ContentPlaceHolder_ddlBussinessSystem`` dropdown and the result
    table is walked page by page.  For each row the company link, company
    name and the text of the sixth column are extracted; rows for which
    ``Mysql.select_fj`` returns ``None`` are stored with ``Mysql.insert_fj``.

    Args:
        name: Only printed in the start-up message and copied to ``city``.

    Returns:
        None on normal completion.  On any exception the error is printed,
        the browser window is closed and the function calls itself again
        with the same ``name``.

    NOTE(review): the retry is unbounded recursion, and the driver is only
    closed on the error path in the visible code.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the matching Excel template
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches',
                                              ['enable-automation'])
        chromeOptions.add_argument('--headless')  # run without a visible browser window
        driver = webdriver.Chrome(
            options=chromeOptions,
            executable_path=
            'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
        )
        driver.maximize_window()
        url = 'http://220.160.52.164:96/ConstructionInfoPublish/Pages/CompanyQuery.aspx?systemID=31'

        # dropdown option value -> '<qualification type label>|<number of pages>'
        urls = {
            '39': '建筑业|1863',
            '31': '省外建筑业|555',
            '9': '招标代理|149',
            '42': '省外招标代理|25',
            '18': '一体化|18',
        }

        driver.get(url)
        for value, zzlxx in zip(urls.keys(), urls.values()):
            zzlx = zzlxx.split('|')[0]
            pages = int(zzlxx.split('|')[1])
            s1 = Select(
                driver.find_element_by_id(
                    'ctl00_ContentPlaceHolder_ddlBussinessSystem')
            )  # wrap the dropdown in a Select helper
            s1.select_by_value(value)  # switch the table to this category
            con = driver.page_source
            html_2 = etree.HTML(con)
            xpath = "//table[@id='ctl00_ContentPlaceHolder_gvDemandCompany']/tbody/tr/td[1]/a"

            # Row indices start at 2 below, hence the +2 on the link count.
            length = len(html_2.xpath(xpath)) + 2
            po = 0  # never changed in the visible code, so the break below never fires
            cc = 10  # column index of the numbered page link used for the last pages
            for page in range(1, pages + 1):
                # Re-read the page source: the browser may have moved to the next page.
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                for i in range(2, length):
                    lengt = len(html_1.xpath(xpath)) + 1  # index of the last row on the current page
                    xpath1 = xpath.replace('tr/td[', f'tr[{i}]/td[')

                    qyurl = 'http://220.160.52.164:96/ConstructionInfoPublish/Pages/' + html_1.xpath(
                        f"{xpath1}/@href")[0].strip()
                    # NOTE(review): as written, '(' is replaced by ')' and ')' by
                    # itself - possibly full-width parentheses were intended; confirm.
                    qyname = html_1.xpath(
                        f"{xpath1}/text()")[0].strip().replace(
                            '\n', '').replace('\t',
                                              '').replace('\r', '').replace(
                                                  '(', ')').replace(')', ')')
                    # Text of the sixth column of the same row.
                    shxydm = html_1.xpath(
                        f"{xpath1.replace('[1]/a','[6]')}/text()")[0].strip(
                        ).replace('\n', '')

                    select = Mysql.select_fj(qyname=qyname,
                                             qyurl=qyurl)  # check whether the record already exists

                    if select == None:
                        Mysql.insert_fj(qyname=qyname,
                                        shxydm=shxydm,
                                        qyurl=qyurl,
                                        zzlx=zzlx)

                    if i == lengt:
                        # Last row of the current page reached.
                        if lengt < length - 1:
                            # Fewer rows than on the first page: stop without paging.
                            break
                        else:
                            if page != pages:
                                if page > pages - 5:
                                    # For the final pages click the numbered link in
                                    # column cc and advance cc, instead of the
                                    # "next page" button.
                                    driver.find_element_by_xpath(
                                        f"//div[@id='ctl00_ContentPlaceHolder_pGrid']/table/tbody/tr/td[{cc}]/a"
                                    ).click()
                                    cc += 1
                                else:
                                    driver.find_element_by_xpath(
                                        f"//a[@id='ctl00_ContentPlaceHolder_pGrid_nextpagebtn']"
                                    ).click()

                        break
    except Exception as e:
        print('湖北\t', e)
        driver.close()
        # Restart the whole crawl from scratch.
        return hubei(name)
コード例 #30
0
ファイル: hq_token.py プロジェクト: Yuan-zewei/jzsc
                driver.refresh()
                break
        except Exception as e:
            print(e)
            driver.quit()
            break


def ipz():
    """Poll the proxy endpoint until it yields a proxy and return it.

    The endpoint is requested repeatedly; whenever the response text does
    not contain ``'data'`` the function sleeps five seconds and tries
    again.  Otherwise the text is parsed as JSON and the first entry of
    its ``'data'`` list is used.

    Returns:
        The proxy address formatted as ``'<ip>:<port>'``.
    """
    while True:
        body = requests.get('代理连接').text
        if 'data' not in body:
            # No proxy in this response yet: wait before asking again.
            time.sleep(5)
            continue
        proxies = json.loads(body)['data']
        first = proxies[0]
        return str(first["ip"]) + ":" + str(first["port"])


# Endless driver loop: fetch the rows returned by Mysql.qiyexx_url(bh='1') and
# process them one by one with a fresh proxy from ipz() for each row.
while True:
    a = Mysql.qiyexx_url(bh='1')
    for x in a:
        try:
            qyid = x[0]  # company eid
            z = x[2]  # company name
            qyid1 = x[3]  # qyid (not used again in the visible code)
            qw = gx_qyid(z=z, eid=qyid)  # could be optimised: refresh only after a failure or an empty load instead of on every row
            selenu(qw, z, ipz())
        except Exception as E:
            # Best-effort: report the failure and continue with the next row.
            print(E)