def _strip_layout_chars(value):
    """Return ``str(value)`` without CR/LF pairs, tabs or surrounding whitespace."""
    return str(value).replace("\r\n", "").replace("\t", "").strip()


def getMainHTML(target, mf_tpe, start_page=358):
    """Crawl the paged listing at ``target`` and persist records not yet stored.

    The first request reads the pager to learn how many pages exist.  Every
    page from ``start_page`` onwards is then fetched and each table row
    (header row excluded) is looked up in the database by its detail URL:

    * unknown URL  -> scrape the detail page via ``getDetailHtml`` and insert;
    * known record without ``reg_name`` -> fill in the name and update;
    * otherwise    -> report that the record already exists.

    A notification e-mail is sent after the last page.

    Args:
        target: URL of the listing page.
        mf_tpe: Value stored in ``Mf.type`` and passed to the URL lookup.
        start_page: First page number to crawl.  Defaults to 358, the value
            that used to be hard-coded in the loop.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting (in particular the
    position of ``time.sleep`` and of the final e-mail) against upstream.
    NOTE(review): a zero-argument ``getMainHTML`` is defined later in this
    source; if both live in one module the later definition replaces this one.
    """
    first_page = requests.get(
        target,
        params={"currentPage": 1, "type": "2"},
        headers=random.sample(headers, 1)[0],
    )
    soup = BeautifulSoup(first_page.content, 'lxml')
    page = soup.find('div', class_='paging').find_all('span')
    # Raw string: "\D" inside a plain literal is an invalid escape sequence.
    recordCount = re.sub(r'\D', "", page[1].get_text())
    # The leading digit is discarded before the rest is read as the page count.
    # NOTE(review): presumably the span text starts with a digit that is not
    # part of the total - confirm against the live page.
    recordCount = recordCount[1:]

    for i in range(start_page, int(recordCount) + 1):
        print("爬取第 %d 页的数据" % (i))
        res = requests.get(
            target,
            params={"currentPage": i, "type": "2"},
            headers=random.sample(headers, 1)[0],
        )
        if res.status_code != 200 or "400 Bad Request" in res.text:
            logging.error("爬行目标 %s 出现解析错误", target)
            continue

        table = BeautifulSoup(res.content, 'lxml').find('table')
        if table is None:
            # A page without a table used to raise AttributeError and abort
            # the whole crawl; treat it like any other unparsable page.
            logging.error("爬行目标 %s 出现解析错误", target)
            continue

        # The first row is the table header.
        for tr in table.find_all('tr')[1:]:
            a = tr.find("a")
            if a is None:
                # Rows without a link carry no record to store.
                continue
            url = a["href"]
            txt = a.get_text()
            ret = db_kit.findMFOnByUrl(url, mf_tpe)
            if ret is None:
                info = getDetailHtml(url)
                mf = db_kit.Mf()
                mf.type = mf_tpe
                mf.catalog = _strip_layout_chars(info[0])
                mf.reg_time = info[1]
                mf.reg_org = info[2]
                mf.reg_num = info[3]
                mf.legal = info[4]
                mf.mng_unit = _strip_layout_chars(info[5])
                mf.expiry_date = _strip_layout_chars(info[6])
                mf.scope = info[7]
                mf.ads = info[8]
                mf.zip_code = info[9]
                mf.tel = info[10]
                mf.phone = info[11]
                mf.url = url
                mf.reg_name = _strip_layout_chars(txt)
                db_kit.insert(mf)
                # Pause after each detail fetch to avoid hammering the site.
                time.sleep(0.5)
            elif ret and not ret.reg_name:
                print("执行了更新操作")
                ret.reg_name = txt
                db_kit.update(ret)
            else:
                print("数据已经存在, %s " % (ret.reg_name))

    send_email.sendEmail("程序完成,快去查看")
def WAndRRbcc():
    """Import a three-level occupation table from a hard-coded .xls workbook.

    Rows 1-698 of the second sheet are scanned.  Column 0 holds a first-level
    entry (2-character code followed by its name), column 1 a second-level
    entry (4-character code followed by its name) and column 2 either a
    third-level entry (6-character code followed by its name), a note, or a
    sub-heading that is prefixed to the names of the following third-level
    entries.  Column 3 holds the type of a third-level entry.  Every entry is
    inserted through ``db_kit.insert`` under the insurer key ``'iorbcc'``.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting against upstream.
    """
    insurance = 'iorbcc'
    book = xlrd.open_workbook('/Users/yuhaihui8913/Documents/wh/人保财产.xls')
    sheet = book.sheet_by_index(1)

    def text_at(row, col):
        # Cell value rendered as text with surrounding whitespace removed.
        return str(sheet.cell(row, col).value).strip()

    fCode = ''
    sCode = ''
    header = ''
    lastF = ''
    for i in range(1, 699):
        first = text_at(i, 0)
        if first != '':
            fCode, fName = first[0:2], first[2:]
            if fCode != lastF:
                # A new first-level category invalidates the running sub-heading.
                header = ''
            obj = INOCC()
            obj.code = fCode.replace(' ', '')
            obj.name = fName.replace(' ', '')
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        second = text_at(i, 1)
        if second != '':
            sCode, sName = second[0:4], second[4:]
            obj = INOCC()
            obj.pCode = fCode
            obj.code = sCode.replace(' ', '')
            obj.name = sName.replace(' ', '')
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        raw_third = str(sheet.cell(i, 2).value)
        third = raw_third.strip()
        # The leading-digit test is made on the unstripped text.
        if third != '' and raw_third.startswith(('0', '1', '2')):
            label = third[6:]
            if header != '':
                label = header + '-' + label
            obj = INOCC()
            obj.name = label.replace(' ', '')
            obj.code = third[0:6].replace(' ', '')
            obj.insurance = insurance
            obj.pCode = sCode
            type_cell = sheet.cell(i, 3)
            if type_cell.ctype == 2:
                # ctype 2 is xlrd's numeric cell type.
                if str(type_cell.value).strip() != '':
                    obj.type = int(type_cell.value)
                else:
                    obj.type = ''
            else:
                obj.type = str(type_cell.value).strip()
            db_kit.insert(obj)
            printObj(obj)
        elif third.startswith('注:'):
            # Note rows are skipped entirely (including the bookkeeping below).
            continue
        else:
            header = third

        lastF = fCode
        print(str(i))
def WAndRHt():
    """Import a three-level occupation table from a hard-coded .xls workbook.

    Rows 2-997 of the first sheet are scanned.  Columns 0/1 hold the
    first-level code and name (the first two characters of the name cell are
    dropped), columns 2/3 the second-level code and name (first four
    characters dropped) and columns 4/5/6 the third-level code, name and
    type.  A row without a third-level code either is a note (skipped) or
    supplies a sub-heading that is prefixed to subsequent third-level names.
    Every entry is inserted through ``db_kit.insert`` under ``'ioht'``.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting against upstream.
    """
    insurance = 'ioht'
    sheet = xlrd.open_workbook(
        '/Users/yuhaihui8913/Documents/wh/华泰职业类别表.xls'
    ).sheet_by_index(0)

    fCode = sCode = header = lastF = ''
    for row in range(2, 998):

        def raw(col, _row=row):
            # Cell value as text, whitespace untouched.
            return str(sheet.cell(_row, col).value)

        top_code = raw(0).strip()
        if top_code != '':
            fCode = top_code
            fName = raw(1).strip()[2:]
            if fCode != lastF:
                # Entering a new first-level category resets the sub-heading.
                header = ''
            obj = INOCC()
            obj.code = fCode
            obj.name = fName
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        mid_code = raw(2).strip()
        if mid_code != '':
            sCode = mid_code
            sName = raw(3).strip()[4:]
            obj = INOCC()
            obj.pCode = fCode
            obj.code = sCode
            obj.name = sName
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        leaf_code = raw(4).strip()
        if leaf_code != '':
            leaf_name = raw(5)
            if header != '':
                leaf_name = header + '-' + leaf_name
            obj = INOCC()
            obj.name = leaf_name.strip()
            obj.code = leaf_code
            obj.insurance = insurance
            obj.pCode = sCode
            type_cell = sheet.cell(row, 6)
            type_text = str(type_cell.value).strip()
            if type_cell.ctype == 2:
                # ctype 2 is xlrd's numeric cell type.
                obj.type = int(type_cell.value) if type_text != '' else ''
            else:
                obj.type = type_text
            db_kit.insert(obj)
            printObj(obj)
        elif raw(5).strip().startswith('注:'):
            # Note rows are skipped entirely (lastF is not refreshed).
            continue
        else:
            # The sub-heading is stored unstripped.
            header = raw(5)

        lastF = fCode
def RAndWPAYL(file,bRow,eRow,insurance):
    """Import a three-level occupation table from the second sheet of ``file``.

    Args:
        file: Path of the workbook opened with ``xlrd.open_workbook``.
        bRow: First row index to read (inclusive).
        eRow: Row index at which reading stops (exclusive).
        insurance: Insurer key stored on every inserted record.

    Column 0 holds a first-level entry (2-character code followed by its
    name), column 1 a second-level entry (4-character code followed by its
    name) and columns 2/3/4 the third-level code, name and type.  A row
    without a third-level code either is a note (skipped) or supplies a
    sub-heading that is prefixed to subsequent third-level names.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting against upstream.
    """
    sheet = xlrd.open_workbook(file).sheet_by_index(1)
    fCode, sCode, header, lastF = '', '', '', ''

    for i in range(bRow, eRow):
        level1 = str(sheet.cell(i, 0).value).strip()
        if level1 != '':
            fCode = level1[0:2]
            fName = level1[2:]
            if fCode != lastF:
                # A new first-level category clears the running sub-heading.
                header = ''
            obj = INOCC()
            obj.code = fCode
            obj.name = fName
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        level2 = str(sheet.cell(i, 1).value).strip()
        if level2 != '':
            sCode = level2[0:4]
            sName = level2[4:]
            obj = INOCC()
            obj.pCode = fCode
            obj.code = sCode
            obj.name = sName
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        level3_code = str(sheet.cell(i, 2).value).strip()
        level3_name = str(sheet.cell(i, 3).value)
        if level3_code != '':
            if header != '':
                full_name = header + '-' + level3_name
            else:
                full_name = level3_name
            obj = INOCC()
            obj.name = full_name.strip()
            obj.code = level3_code
            obj.insurance = insurance
            obj.pCode = sCode
            type_cell = sheet.cell(i, 4)
            if type_cell.ctype != 2:
                # Non-numeric types are stored as text, unstripped.
                obj.type = str(type_cell.value)
            elif str(type_cell.value).strip() != '':
                # ctype 2 is xlrd's numeric cell type.
                obj.type = int(type_cell.value)
            else:
                obj.type = ''
            db_kit.insert(obj)
            printObj(obj)
        elif level3_name.strip().startswith('注:'):
            # Note rows are skipped entirely (lastF is not refreshed).
            continue
        else:
            header = level3_name

        lastF = fCode
def RAndWXls(file,col,bRow,eRow,insurance,hasCode=True):
    """Import one first-level category and its descendants from ``file``.

    Args:
        file: Path of the workbook; its second sheet is read.
        col: Reference column; every cell is addressed relative to it.
        bRow: Row holding the first-level entry and first row of the scan.
        eRow: Row index at which the scan stops (exclusive).
        insurance: Insurer key stored on every inserted record.
        hasCode: When true, codes and names live in separate cells.  When
            false, each cell holds "code name" text that is split on spaces
            (first piece is the code, last piece the name).

    Every record is printed as ``attribute:value`` lines and inserted via
    ``db_kit.insert``.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting against upstream.
    NOTE(review): with ``hasCode`` the cell at ``col - 1`` is used both as the
    second-level name and as the third-level code - verify the column layout.
    """
    sheet = xlrd.open_workbook(file).sheet_by_index(1)

    def dump(record):
        # One "attribute:value" line per instance attribute.
        print('\n'.join(['%s:%s' % pair for pair in record.__dict__.items()]))

    def read_type(row):
        # Numeric cells (xlrd ctype 2) become ints, anything else text.
        cell = sheet.cell(row, col + 1)
        return int(cell.value) if cell.ctype == 2 else str(cell.value)

    # --- first level ---------------------------------------------------
    top = INOCC()
    if hasCode:
        top_name = sheet.cell(bRow, col - 4).value
        top.insurance = insurance
        top.name = top_name
        top.code = sheet.cell(bRow, col - 5).value
    else:
        pieces = str(sheet.cell(bRow, col - 2).value).split(" ")
        top.insurance = insurance
        top.name = pieces[-1]
        top.code = pieces[0]
    fCode = top.code
    dump(top)
    db_kit.insert(top)

    gCode = ''
    fl = ''  # running sub-heading, only used when hasCode is false
    for i in range(bRow, eRow):
        # --- second level ----------------------------------------------
        group_value = sheet.cell(i, col - 1).value
        if group_value != '':
            fl = ''
            group_text = str(group_value)
            group = INOCC()
            group.insurance = insurance
            if hasCode:
                group.name = group_text
                group.pCode = fCode
                group.code = sheet.cell(i, col - 3).value
            else:
                group_pieces = group_text.split(" ")
                group.name = group_pieces[-1]
                group.pCode = fCode
                group.code = group_pieces[0]
            gCode = group.code
            db_kit.insert(group)
            dump(group)

        # --- third level -----------------------------------------------
        leaf = INOCC()
        leaf.pCode = gCode
        if hasCode:
            leaf.code = sheet.cell(i, col - 1).value
            leaf.name = sheet.cell(i, col).value
            leaf.type = read_type(i)
        else:
            detail = str(sheet.cell(i, col).value)
            if detail.startswith("注"):
                continue
            detail_pieces = detail.split(" ")
            if len(detail_pieces) == 1:
                # A cell without a space is a sub-heading for later rows.
                fl = detail_pieces[0]
                continue
            leaf.code = detail_pieces[0]
            if fl == '':
                leaf.name = detail_pieces[-1]
            else:
                leaf.name = fl + '-' + detail_pieces[-1]
            leaf.type = read_type(i)
        leaf.insurance = insurance
        dump(leaf)
        db_kit.insert(leaf)
def WAndRRbjk():
    """Import a three-level occupation table from a hard-coded .xlsx workbook.

    Rows 1-324 of the first sheet are scanned.  Columns 0/1 hold the
    first-level code (converted to int) and name, columns 2/3 the
    second-level code (converted to int) and name, column 4 the third-level
    names and column 5 their type.  A column-4 cell starting with ``'ns'``
    yields one record coded ``<second-level code>00``; any other cell is split
    on ``'、'`` and yields one record per piece, coded ``<second-level code>``
    plus a two-digit running index.  Records are stored under ``'iorbjk'``.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting against upstream.
    NOTE(review): the note/sub-heading fallback reads column 2, which is the
    second-level code column here - verify this is the intended column.
    NOTE(review): the running index restarts at 0 on every row, so two rows
    under the same second-level code produce the same codes - confirm.
    """
    insurance = 'iorbjk'
    sheet = xlrd.open_workbook(
        '/Users/yuhaihui8913/Documents/wh/人保健康职业类别表.xlsx'
    ).sheet_by_index(0)

    def stripped(row, col):
        # Cell value as text with surrounding whitespace removed.
        return str(sheet.cell(row, col).value).strip()

    def type_at(row):
        # Numeric cells (xlrd ctype 2) become ints, anything else stripped text.
        cell = sheet.cell(row, 5)
        if cell.ctype == 2:
            return int(cell.value) if str(cell.value).strip() != '' else ''
        return str(cell.value).strip()

    def prefixed(name):
        # Prepend the running sub-heading, if any.
        return header + '-' + name if header != '' else name

    fCode = ''
    sCode = ''
    header = ''
    lastF = ''
    for i in range(1, 325):
        if stripped(i, 0) != '':
            fCode = int(sheet.cell(i, 0).value)
            fName = stripped(i, 1)
            if fCode != lastF:
                header = ''
            obj = INOCC()
            obj.code = fCode
            obj.name = fName.replace(' ', '')
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        if stripped(i, 2) != '':
            sCode = int(sheet.cell(i, 2).value)
            sName = stripped(i, 3)
            obj = INOCC()
            obj.pCode = fCode
            obj.code = sCode
            obj.name = sName.replace(' ', '')
            obj.insurance = insurance
            db_kit.insert(obj)
            printObj(obj)

        names_text = stripped(i, 4)
        if names_text != '':
            compact = str(sheet.cell(i, 4).value).replace(' ', '')
            if compact.startswith('ns'):
                obj = INOCC()
                obj.name = prefixed(names_text).replace(' ', '')
                obj.code = (str(sCode) + '00').replace(' ', '')
                obj.insurance = insurance
                obj.pCode = sCode
                obj.type = type_at(i)
                db_kit.insert(obj)
            else:
                for j, tname in enumerate(compact.split('、')):
                    obj = INOCC()
                    obj.name = prefixed(tname).replace(' ', '')
                    obj.code = (str(sCode) + str(j).zfill(2)).replace(' ', '')
                    obj.insurance = insurance
                    obj.pCode = sCode
                    obj.type = type_at(i)
                    db_kit.insert(obj)
        elif stripped(i, 2).startswith('注:'):
            # Note rows are skipped entirely (including the bookkeeping below).
            continue
        else:
            header = stripped(i, 2)

        lastF = fCode
        print(str(i))
def RAndWTPYRS_html():
    """Import a three-level occupation table from a hard-coded HTML export.

    The ``MsoNormalTable`` table is walked row by row; the number of ``<td>``
    cells in a row decides what the row contains:

    * 5 cells - first-level, second-level and third-level entry;
    * 4 cells - second-level and third-level entry;
    * 3 cells - third-level entry only;
    * 2 cells - optional second-level entry plus a sub-heading;
    * 1 cell  - a sub-heading, or a note / blank row that is skipped.

    First- and second-level entries are inserted only when
    ``db_kit.existCheck`` answers ``'no'``; third-level entries are always
    inserted.  The running sub-heading is prefixed to third-level names and
    reset when the first-level code changes.  Records use ``'iotpyrs'``.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; ``printObj`` is assumed to sit inside the
    ``existCheck`` guard and the sub-heading update of 2-cell rows outside the
    ``len(ps) == 2`` test - confirm both against upstream.
    NOTE(review): no parser is named for BeautifulSoup, so the library picks
    one itself; left unchanged on purpose.
    """
    insurance = 'iotpyrs'

    # The context manager closes the handle that a bare open() call leaked.
    with open('/Users/yuhaihui8913/Documents/wh/太平洋人寿final.html') as html_file:
        soup = BeautifulSoup(html_file)
    print(soup.prettify())
    trs = soup.find("table", class_="MsoNormalTable").find_all("tr")

    fCode = ''
    sCode = ''
    header = ''
    lastF = ''

    def insert_if_new(record):
        # Store the record unless its code already exists for this insurer.
        if db_kit.existCheck(record.code, insurance) == 'no':
            db_kit.insert(record)
            printObj(record)

    def new_second_level(code, name):
        # Build a second-level record under the current first-level code.
        record = INOCC()
        record.pCode = fCode
        record.code = code
        record.name = name
        record.insurance = insurance
        return record

    def new_leaf(cell):
        # Build a third-level record from a cell whose <p> holds code, name.
        record = INOCC()
        parts = cell.p.contents
        text = str(parts[1].get_text()).strip()
        full_name = header + '-' + text if header != '' else text
        record.name = full_name.strip()
        record.code = parts[0].get_text().strip()
        record.insurance = insurance
        record.pCode = sCode
        return record

    for tr in trs:
        tds = tr.find_all("td")
        cell_count = len(tds)

        if cell_count == 5:
            ps = tds[0].find_all('p')
            first_text = str(ps[0].get_text()).strip()
            if first_text != '':
                fCode = first_text
                # NOTE(review): the name is taken from the same paragraph as
                # the code - verify this is intended.
                fName = first_text
            elif len(ps) == 1:
                # Blank cell: keep the current first-level code.
                fName = fCode
            else:
                fCode = ps[0].get_text()
                fName = ps[1].get_text()
            if fCode != lastF:
                header = ''
            obj = INOCC()
            obj.code = fCode
            obj.name = fName
            obj.insurance = insurance
            insert_if_new(obj)

            ps = tds[1].find_all('p')
            sCode = str(ps[0].get_text()).strip()
            if len(ps) == 1:
                sName = str(ps[0].get_text()).strip()
            else:
                sName = str(ps[1].get_text()).strip()
            insert_if_new(new_second_level(sCode, sName))

            obj = new_leaf(tds[2])
            obj.type = tds[4].p.span.get_text().strip()
            db_kit.insert(obj)
            printObj(obj)

        elif cell_count == 4:
            ps = tds[0].find_all('p')
            sCode = str(ps[0].get_text()).strip()
            sName = str(ps[1].get_text()).strip()
            insert_if_new(new_second_level(sCode, sName))

            obj = new_leaf(tds[1])
            obj.type = tds[3].p.span.get_text().strip()
            db_kit.insert(obj)
            printObj(obj)

        elif cell_count == 3:
            obj = new_leaf(tds[0])
            try:
                obj.type = tds[2].get_text().strip()
            except AttributeError:
                # str() is required: concatenating a Tag with a str raised
                # TypeError inside this handler.
                print(str(tds[2]) + '===========================================================================')
            db_kit.insert(obj)
            printObj(obj)

        elif cell_count == 2:
            ps = tds[0].find_all('p')
            if len(ps) == 2:
                sCode = str(ps[0].get_text()).strip()
                sName = str(ps[1].get_text()).strip()
                insert_if_new(new_second_level(sCode, sName))
            heading = tds[1].p.span.get_text().strip()
            if heading != '':
                header = heading

        elif cell_count == 1:
            heading = tds[0].p.span.get_text().strip()
            if heading != '' and not heading.startswith('注:'):
                header = heading
            else:
                # Blank and note rows are skipped (lastF is not refreshed).
                continue

        lastF = fCode
def getMainHTML():
    """Crawl the organisation list at the module-level ``target`` URL.

    The landing page is fetched once.  For the section whose anchor name is
    ``'8245'`` the total page count is read, then every page is visited: the
    first one from the landing page itself, the others via a multipart
    postback that names the wanted page.  Each company link on a page is
    looked up by detail URL; unknown companies are scraped with
    ``getCompanyHtml`` and inserted as ``db_kit.Insurer`` when exactly nine
    fields come back, otherwise an error is logged.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source; confirm the nesting (notably the position of
    ``time.sleep``) against upstream.
    NOTE(review): a two-argument ``getMainHTML`` is defined earlier in this
    source; if both live in one module this definition replaces that one.
    """
    session = requests.session()
    res = session.get(target, params=None, headers=random.sample(headers, 1)[0])
    cookie = res.cookies
    soup = BeautifulSoup(res.content, 'lxml')
    section_anchors = soup.find('ess_contentpane').find_all('a', recursive=False)
    bxgslx = ''

    for a in section_anchors:
        # Restrict the crawl to one specific section.
        if a['name'] != '8245':
            continue

        element_prefix = 'ess_ctr' + a['name']
        form_prefix = "ess$ctr" + a['name']

        total_text = soup.find(id=element_prefix + '_OrganizationList_lblPageNum').get_text()
        current_text = soup.find(id=element_prefix + '_OrganizationList_lblAtPageNum').get_text()
        pageTotal = int(total_text if total_text else '1')
        pageNum = int(current_text if current_text else '1')
        bxgslx = soup.find(id=element_prefix + '_OrganizationList_lblClassName')
        print('当前页 %d ,一共 %d 页' % (pageNum, pageTotal))

        view_state = ''
        view_state_generator = ''
        # NOTE(review): iteration i posts page i + 1, so the last iteration
        # asks for page pageTotal + 1 - confirm the upper bound.
        for i in range(0, pageTotal + 1):
            progress = '编号为 %s 的项目 执行了第 %d 次 ' % (a['name'], i)
            logging.info(progress)
            print(progress)

            if i == 0:
                links = soup.find(id=element_prefix + '_OrganizationList_rptCompany').find_all('a')
                view_state = soup.find(id='__VIEWSTATE')['value']
                view_state_generator = soup.find(id='__VIEWSTATEGENERATOR')['value']
            else:
                # (None, value) tuples make requests send plain multipart fields.
                form = {
                    "__EVENTTARGET": (
                        None,
                        form_prefix + "$OrganizationList$lbnToPage",
                        None,
                    ),
                    form_prefix + "$OrganizationList$lblAtPageNum": (None, str(i + 1)),
                    "__EVENTARGUMENT": (None, ""),
                    "__VIEWSTATEGENERATOR": (None, view_state_generator),
                    "__VIEWSTATE": (None, view_state),
                }
                user_agent = random.sample(headers, 1)[0]['User-Agent']
                # NOTE(review): the proxy is fetched but never handed to the
                # request below - confirm whether it should be used.
                proxy = requests.get('http://192.168.50.229:5010/get').text
                proxies = {"http": proxy}
                _res = session.post(
                    target,
                    files=form,
                    cookies=cookie,
                    headers={
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                        "Referer": "http://www.circ.gov.cn/tabid/2576/Default.aspx",
                        "Host": "www.circ.gov.cn",
                        "Accept-Language": "zh-CN,zh;q = 0.9",
                        "Origin": "http://www.circ.gov.cn",
                        "User-Agent": user_agent,
                    })
                _soup = BeautifulSoup(_res.content, 'lxml')
                container = _soup.find(id=element_prefix + '_OrganizationList_rptCompany')
                if container is None:
                    logging.error('编号为 %s 的项目 执行了第 %d 次 ' % (a['name'], i) + ",主页没有解析到正确的链接数据")
                    continue
                links = container.find_all('a')
                # NOTE(review): these are re-read from the landing page, not
                # from the response just received - confirm this is intended.
                view_state = soup.find(id='__VIEWSTATE')['value']
                view_state_generator = soup.find(id='__VIEWSTATEGENERATOR')['value']

            print(links)
            for link in links:
                # The detail URL is the first single-quoted string in onclick.
                detailUrl = re.findall(r"'(.+?)'", str(link['onclick']))
                if db_kit.findOnByUrl(detailUrl[0]) is not None:
                    continue
                companyInfo = getCompanyHtml(detailUrl[0])
                if len(companyInfo) == 9:
                    insurer = db_kit.Insurer()
                    insurer.orgName = companyInfo[0]
                    insurer.orgType = companyInfo[1]
                    insurer.cat = companyInfo[2]
                    insurer.orgAddress = companyInfo[3]
                    insurer.tel = companyInfo[4]
                    insurer.leader = companyInfo[5]
                    insurer.capital = companyInfo[6]
                    insurer.registerAddress = companyInfo[7]
                    insurer.state = companyInfo[8]
                    insurer.url = detailUrl[0]
                    insurer.catalog = bxgslx.get_text()
                    db_kit.insert(insurer)
                else:
                    logging.error(detailUrl[0] + '返回的内容不正确。没有解析出正确内容')
                # Random pause after each detail fetch.
                time.sleep(random.randint(1, 2))