def download_pdf(self, fileId):
    """Download the first attachment listed for ``fileId`` and upload it.

    Opens the download page for ``fileId``, uses the text of the first
    ``span`` element as the file name, requests the download URL and polls
    ``./webDriver_download/`` for the file. When the file appears it is
    handed to ``fileUpdate`` (presumably an OSS upload helper; its return
    value is stored as ``ossPath``), deleted locally, and a record is
    inserted into ``self.db[tableName]``.

    Args:
        fileId: identifier substituted into the download URL.

    Returns:
        dict: on success the keys ``fileId``, ``name``, ``ossPath`` and
        ``insertTime``, plus ``_id`` as a string when the insert assigned
        one. If the file never shows up, the same keys with
        ``ossPath=None`` plus ``errMsg`` and ``state=703``. An empty dict
        when the page contains no ``span`` element.
        None: if any unexpected exception is raised (traceback is printed).
    """
    urlBase = "https://www.zhongdengwang.org.cn/rs/download.do?method=getDownload&type={}&id={}"
    dictNow = {}
    driver = self.driver
    try:
        try:
            driver.get(urlBase.format('00', fileId))
        except TimeoutException:
            # A page-load timeout is tolerated; the content may still be usable.
            print("driver超时异常,忽略并尝试提取内容")

        # Wait (up to 300 x 0.1 s) for the page marker to be rendered.
        for _ in range(300):
            if '初始登记' in driver.page_source:
                break
            time.sleep(0.1)

        for item in driver.find_elements_by_css_selector("span"):
            # Only the first span is processed: every path below returns.
            name = item.text
            print("获取文件名", name)
            filePath = "./webDriver_download/" + name
            print(name, "开始下载")

            # Trigger the actual download.
            driver.get(urlBase.format('01', fileId))

            # Poll for the downloaded file: 10 attempts, 0.6 s apart
            # (about 6 s, although the messages below say 10 s).
            for _ in range(10):
                if os.path.exists(filePath):
                    # Short grace period before the file is read.
                    time.sleep(0.2)
                    ossPath = fileUpdate(filePath)
                    # Remove the local copy once it has been uploaded.
                    os.remove(filePath)
                    dictNow['fileId'] = fileId
                    dictNow['name'] = name
                    dictNow['ossPath'] = ossPath
                    dictNow['insertTime'] = datetime.datetime.now()
                    try:
                        self.db[tableName].insert(dictNow)
                    except Exception:
                        # Caching is best-effort; the download itself succeeded.
                        print("WARNING数据库链接异常,将会导致缓存失败")
                        traceback.print_exc()
                    # The insert may have failed before an _id was assigned;
                    # do not let that discard a successful download.
                    if '_id' in dictNow:
                        dictNow['_id'] = str(dictNow['_id'])
                    return dictNow
                print("文件还在浏览器下载中,请稍后!")
                time.sleep(0.6)

            # The file never appeared: report the failure for this item.
            print("10秒都没有下载成功,下载异常")
            dictNow['fileId'] = fileId
            dictNow['name'] = name
            dictNow['ossPath'] = None
            dictNow['insertTime'] = datetime.datetime.now()
            dictNow['errMsg'] = "ERROR 下载10秒都没有下载完,可能是中登网下载链接失效无法下载"
            dictNow['state'] = 703
            return dictNow
        return dictNow
    except Exception:
        traceback.print_exc()
        return None
def get_full_screen_oss(self):
    """Take a screenshot of the current page and return its uploaded URL.

    Scrolls the document down (``scrollTop=10000``), saves a screenshot to
    a uniquely named PNG in the working directory, passes that file to
    ``fileUpdate`` and returns whatever ``fileUpdate`` returns. The local
    file is removed afterwards, including when the upload raises.
    """
    js = "var action=document.documentElement.scrollTop=10000"
    self.driver.execute_script(js)
    fileName = str(uuid.uuid1()) + 'full_snap.png'
    try:
        self.driver.save_screenshot(fileName)
        return fileUpdate(fileName)
    finally:
        # Always clean up the temporary screenshot; it may not exist if
        # saving the screenshot itself failed.
        if os.path.exists(fileName):
            os.remove(fileName)
def getCodeString(self):
    """Recognise the current captcha via the remote recognition services.

    Repeats until at least one service returns a result: for each service
    URL a captcha image is captured (``self.get_image``, falling back to
    ``self.get_image_old``), uploaded with ``fileUpdate``, logged to the
    ``test_yanzhengma`` collection and posted to the service. When a
    service answers with ``state == 200`` its ``data`` is collected; any
    other state stops the current round. If nothing was collected the
    captcha image is clicked (``self.click100('yzm_img')``) and the round
    is retried.

    Returns:
        One of the collected ``data`` values, chosen at random.
    """
    url_cw = "http://39.108.188.34:9020/middleware/identifyingChinese/upload.go?filename=fapiao"  # Chinese, public network
    url_ew = "http://39.108.188.34:9021/middleware/identifyingEnglish/upload.go?filename=fapiao"  # English, public network
    urlList = [url_cw, url_ew]
    pdfs = []
    while True:
        # Keep trying until some service succeeds.
        for url in urlList:
            try:
                # Capture the captcha image.
                try:
                    filePath = self.get_image()
                except Exception:
                    print("[WARNING]使用旧方法请求王博验证码接口")
                    traceback.print_exc()
                    filePath = self.get_image_old()  # TODO: remove in the future
                ossUrl = fileUpdate(filePath)
                print("验证码图片存储OSS", ossUrl)
                self.db['test_yanzhengma'].insert_one({'ossUrl': ossUrl})

                # Send the image to the recognition service.
                try:
                    # The colour hint is the last "_"-separated part of the file name.
                    colorStr = filePath.split('_')[-1].replace('.png', '')
                    # Close the handle before the file is removed below.
                    with open(filePath, 'rb') as imageFile:
                        file = {'file': imageFile, 'etc': colorStr}
                        print("发起对【王博】验证码接口的请求 颜色", colorStr, "文件", filePath)
                        response = requests.post(url, files=file)
                    print("发起对【王博】验证码接口返回", response.text)
                    try:
                        dict_res = json.loads(response.text)
                    except Exception:
                        print("验证码接口未返回JSON格式")
                        continue
                    if dict_res['state'] == 200:
                        pdfs.append(dict_res['data'])
                    else:
                        # The service asks for a different captcha: end this round.
                        print("算法要求更换验证码")
                        break
                finally:
                    os.remove(filePath)
                    print("删除验证码")
            except NoSuchElementException:
                traceback.print_exc()
                print("【发票验真平台】出现定位不到标签的错误,可能是登陆状态丢失,重新启动浏览器,并保存截图")
                ossUrl = self.get_full_screen_oss()
                print("截图ossURL,并休息2秒", ossUrl)
                self.driver.restartDriver()
                time.sleep(5)
            except Exception:
                print("验证码接口请求异常", url)
                traceback.print_exc()

        if pdfs:
            print("至少有一个有正确结果,随机取一个返回")
            ans = random.choice(pdfs)
            print("返回", ans)
            return ans

        # Nothing recognised: switch to another captcha image and retry.
        self.click100('yzm_img')
def _deal(self, input): if 'invoiceCode' in input: input['fpdm'] = input['invoiceCode'] if 'invoiceNo' in input: input['fphm'] = input['invoiceNo'] if 'date' in input: input['kprq'] = input['date'] if 'amount' in input: input['kjje'] = input['amount'] if 'checkCode' in input: input['jym'] = input['checkCode'] keyMustExist = ["fpdm", "fphm", "kprq", "kjje"] for key in keyMustExist: if key not in input: input.update({ 'state': 619, 'errMsg': "ERROR缺少必备参数{},fpdm、fphm、kprq、kjje、(校验码如果有必须填jym)为必须存在的参数" }) return input driver = self.driver while True: try: dictNow = input try: driver.get('https://inv-veri.chinatax.gov.cn/index.html') except TimeoutException: print("driver超时异常,忽略并尝试提取内容") print("正在输入发票信息") # 1 fpdm = input['fpdm'] fphm = input['fphm'] kprq = input['kprq'] kjje = input.get('kjje', '') #取后六位 jym = input.get('jym', '000000')[-6:] for i in range(100): if 'fpdm' in driver.page_source: break time.sleep(0.1) # 发票代码 driver.find_element_by_id('fpdm').clear() driver.find_element_by_id('fpdm').send_keys(fpdm) # 发票号码 driver.find_element_by_id('fphm').clear() driver.find_element_by_id('fphm').send_keys(fphm) # 开票日期 driver.find_element_by_id('kprq').clear() driver.find_element_by_id('kprq').send_keys(kprq) time.sleep(0.1) # 开具金额 if "开具金额(不含税)" in driver.page_source: driver.find_element_by_id('kjje').clear() driver.find_element_by_id('kjje').send_keys(kjje) else: driver.find_element_by_id('kjje').clear() driver.find_element_by_id('kjje').send_keys(jym) # 首先调整验证码大小 # imgTag = driver.find_element_by_id('yzm_img') # self.setAttribute(imgTag, "width", 90) # self.setAttribute(imgTag, "height", 35) # imgTag2 = driver.find_element_by_id('yzm_unuse_img') # self.setAttribute(imgTag2, "width", 90) # self.setAttribute(imgTag2, "height", 35) # 看输入是否有错误 for i in range(100): if "发票代码有误!" 
in driver.page_source: print("发票代码有误") dictNow = { 'errMsg': "ERROR发票代码有误!无法返回信息,请输入正确的数据或格式", 'state': 601 } dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "发票号码有误!" in driver.page_source: print("发票号码有误") dictNow = { 'errMsg': "ERROR发票号码有误!无法返回信息,请输入正确的数据或格式", 'state': 602 } dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "开票日期有误!" in driver.page_source: print("开票日期有误") dictNow = { 'errMsg': "ERROR开票日期有误!无法返回信息,请输入正确的数据或格式", 'state': 603 } dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "开票金额有误!" in driver.page_source: print("开票金额有误") dictNow = { 'errMsg': "ERROR开票金额有误!无法返回信息,请输入正确的数据或格式", 'state': 604 } dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "校验码有误!" in driver.page_source: print("校验码有误!") dictNow = { 'errMsg': "ERROR校验码有误!无法返回信息,请输入正确的数据或格式", 'state': 605 } dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "请输入发票号码" in driver.page_source: print("请输入发票号码") dictNow = {'errMsg': "ERROR请输入发票号码!", 'state': 606} dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "请输入发票代码" in driver.page_source: print("请输入发票代码") dictNow = {'errMsg': "ERROR请输入发票代码", 'state': 607} dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "请输入开具金额" in driver.page_source: print("请输入开具金额") dictNow = {'errMsg': "ERROR请输入开具金额", 'state': 608} dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "请输入开票日期" in driver.page_source: print("请输入开票日期") dictNow = {'errMsg': "ERROR请输入开票日期", 'state': 609} dictNow['ossPathScreen'] = self.get_full_screen_oss() input.update(dictNow) return dictNow if "请输入校验码" in driver.page_source: print("请输入校验码") dictNow = {'errMsg': "ERROR请输入校验码", 'state': 610} dictNow['ossPathScreen'] = self.get_full_screen_oss() 
input.update(dictNow) return dictNow if 'yzm_img' in driver.page_source: self._state = "已经出现验证码标签" print("出现验证码标签") break time.sleep(0.1) """ 验证码 """ time.sleep(0.1) try: for i in range(50): if '请输入验证码' in driver.page_source: self._state = "已经成功渲染验证码" print("成功渲染验证码") break time.sleep(0.1) else: print("5秒钟验证码还没有出现,刷新页面") print("错误截图", self.get_full_screen_oss()) self.driver.refresh() continue if '请输入验证码文字' in driver.page_source: print("不打不带颜色的码,跳过") continue try: # 不存在会抛异常 driver.find_element_by_id('yzminfo') \ .find_element_by_css_selector('font') \ .get_attribute('color') except: print("不打不带颜色的码,跳过") continue # 处理逻辑是:如果为空,换一张验证码.不为空则直接输入 imgCode1 = self.getCodeString() except: print("未知错误") traceback.print_exc() continue print("成功识别", imgCode1) inputa = imgCode1 driver.find_element_by_id('yzm').clear() driver.find_element_by_id('yzm').send_keys(inputa) print("【验证】已经完成所有信息输入") time.sleep(0.1) # 这里延时未来可以调低一点,为了录屏 action_chains = ActionChains(self.driver) action_chains.double_click( driver.find_element_by_id('checkfp')).perform() print("正在进行查验") time.sleep(0.1) for i in range(100): if 'popup_ok' in driver.page_source or 'iframe' in driver.page_source: break time.sleep(0.1) if '超过该张发票当日查验次数' in driver.page_source: print("超过次数") dictNow = { 'errMsg': "ERROR验真平台今日超过次数!无法返回信息", 'state': 413 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow # 看输入是否有错误 elif "发票代码有误!" in driver.page_source: print("发票代码有误") dictNow = { 'errMsg': "ERROR发票代码有误!无法返回信息,请输入正确的数据或格式", 'state': 601 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow elif "发票号码有误!" in driver.page_source: print("发票号码有误") dictNow = { 'errMsg': "ERROR发票号码有误!无法返回信息,请输入正确的数据或格式", 'state': 602 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow elif "开票日期有误!" in driver.page_source: print("开票日期有误") dictNow = { 'errMsg': "ERROR开票日期有误!无法返回信息,请输入正确的数据或格式", 'state': 603 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow elif "开票金额有误!" 
in driver.page_source: print("开票金额有误!") dictNow = { 'errMsg': "ERROR开票金额有误!无法返回信息,请输入正确的数据或格式", 'state': 604 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow elif '校验码有误!' in driver.page_source: print("校验码有误") dictNow = { 'errMsg': "校验码有误!无法返回信息,请输入校验码或查看是否正确", 'state': 605 } dictNow['ossPathScreen'] = self.get_full_screen_oss() return dictNow elif '一分钟' in driver.page_source: print("访问过于频繁,休息60秒后再试") # popup_ok # driver.find_element_by_id('popup_ok').click() # 点击搜索按钮 self._state = "由于访问频率太快,系统强制占用进程休息60秒" time.sleep(60) continue elif 'popup_ok' in driver.page_source: print("验证码输入错误,重试流程") continue elif 'iframe' in driver.page_source: print("验证成功,跳转Iframe") driver.switch_to_frame(0) print(driver.page_source) filePath = 'fapiao/' + fpdm + '.png' driver.save_screenshot(filePath) ossPath = fileUpdate(filePath) invoiceData = {} try: invoiceData = self._parseInvoice(driver.page_source) except: invoiceData[ 'errMsg'] = "ERROR解析结构异常,发现新模板或者改版,请将这个返回交付开发者进行模板添加" invoiceData['html'] = driver.page_source invoiceData['errMsgText'] = traceback.format_exc() dictNow.update({ 'data': invoiceData, 'ossPathScreen': ossPath, 'errMsg': "ERROR解析结构异常,发现新模板或者改版,请将这个返回交付开发者进行模板添加!", 'state': 579 }) dictNow.update({ 'data': invoiceData, 'ossPathScreen': ossPath, 'errMsg': invoiceData.get('errMsg', 'maybe_success!'), 'state': invoiceData.get('state', 299) }) return dictNow except: print("未知异常,进行上报") traceback.print_exc() dictNow = {} dictNow.update({ 'errMsg': traceback.format_exc(), 'state': 599 }) return dictNow
def download_pdf(self, regno, companyName, pdfs):
    """Download every file linked from a registration page and upload it.

    Opens the file-view page for ``regno``, triggers a download for each
    ``a`` element (the first through the page's ``download(...)`` script,
    the rest by clicking), then polls ``./webDriver_download/`` for each
    file. Each file found is passed to ``fileUpdate`` (presumably an OSS
    upload helper; its return value is stored as ``ossPath``), removed
    locally and recorded in ``self.db[tableName]``.

    Args:
        regno: registration number substituted into the page URL.
        companyName: stored verbatim in every record.
        pdfs: list that receives one record per link; it is mutated in
            place and also returned.

    Returns:
        ``pdfs``. Successful records carry ``regno``, ``pdfName``,
        ``companyName``, ``ossPath``, ``insertTime`` and ``_id`` (string).
        Files that never appear carry ``ossPath=None`` plus ``errMsg`` and
        ``state=703``. Any exception aborts the remaining work; the
        traceback is printed and the records collected so far are returned.
    """
    # "&regno" must be spelled out: an entity-decoded "&reg" turns into the
    # "®" character and breaks the query parameter.
    url = "https://www.zhongdengwang.org.cn/rs/conditionquery/byid.do?method=viewfile&regno={}&type=1"
    driver = self.driver
    try:
        try:
            driver.get(url.format(regno))
        except TimeoutException:
            # A page-load timeout is tolerated; the content may still be usable.
            print("driver超时异常,忽略并尝试提取内容")

        # Wait (up to 100 x 0.1 s) for the download links to be rendered.
        for _ in range(100):
            if '下载' in driver.page_source:
                break
            time.sleep(0.1)

        # First pass: trigger a download for every link on the page.
        for i, aItem in enumerate(driver.find_elements_by_css_selector("a")):
            name = aItem.text
            print("获取文件名", name)
            print(name, "尚未缓存,走下载上传路线")
            if i == 0:
                # The first link (initial registration) is started through
                # the page's own download() function.
                driver.execute_script("download('{}');".format(
                    name.replace('.pdf', '')))
                time.sleep(0.5)
            else:
                aItem.click()
                time.sleep(0.1)

        # Second pass: wait for each file and upload it.
        for aItem in driver.find_elements_by_css_selector("a"):
            name = aItem.text
            filePath = "./webDriver_download/" + name
            print("正在检查文件是否下载完成", filePath)
            # Poll 100 times, 0.1 s apart (about 10 s) per file.
            for _ in range(100):
                if os.path.exists(filePath):
                    ossPath = fileUpdate(filePath)
                    # Remove the local copy once it has been uploaded.
                    os.remove(filePath)
                    dictNow = {}
                    dictNow['regno'] = regno
                    dictNow['pdfName'] = name
                    dictNow['companyName'] = companyName
                    dictNow['ossPath'] = ossPath
                    dictNow['insertTime'] = datetime.datetime.now()
                    self.db[tableName].insert(dictNow)
                    dictNow['_id'] = str(dictNow['_id'])
                    pdfs.append(dictNow)
                    break
                print("文件还在浏览器下载中,请稍后!")
                time.sleep(0.1)
            else:
                # Decided per file, so a failure is reported even when an
                # earlier file already produced a record.
                print("10秒都没有下载成功,下载异常")
                dictNow = {}
                dictNow['regno'] = regno
                dictNow['pdfName'] = name
                dictNow['companyName'] = companyName
                dictNow['ossPath'] = None
                dictNow['insertTime'] = datetime.datetime.now()
                dictNow['errMsg'] = "ERROR 下载10秒都没有下载完,可能是中登网下载链接失效无法下载"
                dictNow['state'] = 703
                pdfs.append(dictNow)
    except Exception:
        traceback.print_exc()
        time.sleep(1)
    return pdfs