Example #1
0
    def download_pdf(self, fileId):
        """Download the file for ``fileId`` via the browser and store it.

        Steps:
          1. Open the ``type=00`` page and wait (up to 300 polls of 0.1s)
             until the text '初始登记' shows up in the page source.
          2. Use the text of the first ``<span>`` as the file name.
          3. Navigate to the ``type=01`` URL to trigger the download and poll
             ``./webDriver_download/<name>`` up to 10 times, 0.6s apart.
          4. Pass the file to ``fileUpdate`` (presumably an OSS upload that
             returns the remote path -- defined elsewhere in the file),
             delete the local copy and insert a record into
             ``self.db[tableName]``.

        Args:
            fileId: identifier substituted into the download URL.

        Returns:
            dict: on success, ``fileId``/``name``/``ossPath``/``insertTime``
                (plus a stringified ``_id`` when the insert succeeded).
                When the file never appears, the same keys with
                ``ossPath=None`` plus ``errMsg`` and ``state=703``.
            None: when the page has no ``<span>`` or any unexpected error
                occurs (the traceback is printed).
        """
        urlBase = "https://www.zhongdengwang.org.cn/rs/download.do?method=getDownload&type={}&id={}"
        driver = self.driver
        try:
            try:
                driver.get(urlBase.format('00', fileId))
            except TimeoutException:
                # A page-load timeout is tolerated: the content may already
                # be usable.
                print("driver超时异常,忽略并尝试提取内容")
            for _ in range(300):
                if '初始登记' in driver.page_source:
                    break
                time.sleep(0.1)

            # Only the first span is ever processed.
            spans = driver.find_elements_by_css_selector("span")
            if not spans:
                return None

            name = spans[0].text
            print("获取文件名", name)
            filePath = "./webDriver_download/" + name
            print(name, "开始下载")
            # Trigger the download.
            driver.get(urlBase.format('01', fileId))

            for _ in range(10):
                if not os.path.exists(filePath):
                    print("文件还在浏览器下载中,请稍后!")
                    time.sleep(0.6)  # 10 polls x 0.6s
                    continue

                time.sleep(0.2)
                ossPath = fileUpdate(filePath)
                # Remove the local copy once it has been uploaded.
                os.remove(filePath)
                record = {
                    'fileId': fileId,
                    'name': name,
                    'ossPath': ossPath,
                    'insertTime': datetime.datetime.now(),
                }
                try:
                    self.db[tableName].insert(record)
                except Exception:
                    # Caching is best-effort; the download itself succeeded.
                    print("WARNING数据库链接异常,将会导致缓存失败")
                    traceback.print_exc()
                # '_id' is only present when the insert went through; do not
                # let a cache failure turn a good download into ``None``.
                if '_id' in record:
                    record['_id'] = str(record['_id'])
                return record

            print("10秒都没有下载成功,下载异常")
            return {
                'fileId': fileId,
                'name': name,
                'ossPath': None,
                'insertTime': datetime.datetime.now(),
                'errMsg': "ERROR 下载10秒都没有下载完,可能是中登网下载链接失效无法下载",  # 703
                'state': 703,
            }
        except Exception:
            traceback.print_exc()
            return None
    def get_full_screen_oss(self):
        """Screenshot the current page and hand it to ``fileUpdate``.

        The page is scrolled down first (``scrollTop=10000``), then a
        screenshot is saved under a unique local name and passed to
        ``fileUpdate`` (presumably an OSS upload -- defined elsewhere in the
        file). The local file is removed afterwards, even when the upload
        raises.

        Returns:
            Whatever ``fileUpdate`` returns for the screenshot file.
        """
        self.driver.execute_script(
            "var action=document.documentElement.scrollTop=10000")
        fileName = str(uuid.uuid1()) + 'full_snap.png'
        self.driver.save_screenshot(fileName)
        try:
            return fileUpdate(fileName)
        finally:
            # Always clean up the temporary screenshot; guard the removal so
            # a missing file does not mask the upload's own exception.
            if os.path.exists(fileName):
                os.remove(fileName)
    def getCodeString(self):
        """Recognise the current captcha via the remote recognition services.

        Loops until at least one service returns a result. In every round,
        for each service URL:
          * capture the captcha image with ``self.get_image()`` (falling back
            to ``self.get_image_old()``, whose image is also passed to
            ``fileUpdate`` and logged in ``self.db['test_yanzhengma']``);
          * POST the image, together with the colour hint taken from the file
            name (the part after the last ``_``, without ``.png``);
          * collect ``data`` when the JSON reply has ``state == 200``; any
            other state ends the round early.
        The captured image file is deleted after every attempt. When a round
        yields nothing, the captcha is changed through
        ``self.click100('yzm_img')`` and the round is repeated.

        Returns:
            One of the recognised codes, picked at random.
        """
        url_cw = "http://39.108.188.34:9020/middleware/identifyingChinese/upload.go?filename=fapiao"  # Chinese, public network
        url_ew = "http://39.108.188.34:9021/middleware/identifyingEnglish/upload.go?filename=fapiao"  # English, public network
        # Local deployments expose the same paths on localhost:9020 / :9021.
        urlList = [url_cw, url_ew]

        pdfs = []

        while True:
            # Keep trying until something succeeds.
            for url in urlList:
                try:
                    # Capture the captcha image.
                    try:
                        filePath = self.get_image()
                    except Exception:
                        print("[WARNING]使用旧方法请求王博验证码接口")
                        traceback.print_exc()
                        filePath = self.get_image_old()
                        # TODO: remove in the future
                        ossUrl = fileUpdate(filePath)
                        print("验证码图片存储OSS", ossUrl)
                        self.db['test_yanzhengma'].insert_one(
                            {'ossUrl': ossUrl})
                    # Recognise the captcha; the image is always deleted.
                    try:
                        # The handle is closed before the file is removed.
                        with open(filePath, 'rb') as image:
                            colorStr = filePath.split('_')[-1].replace(
                                '.png', '')
                            payload = {'file': image, 'etc': colorStr}
                            print("发起对【王博】验证码接口的请求 颜色", colorStr, "文件", filePath)
                            response = requests.post(url, files=payload)
                        print("发起对【王博】验证码接口返回", response.text)
                        try:
                            dict_res = json.loads(response.text)
                        except ValueError:
                            print("验证码接口未返回JSON格式")
                            continue
                        if dict_res['state'] == 200:
                            pdfs.append(dict_res['data'])
                        else:
                            # The service asks for a different captcha.
                            print("算法要求更换验证码")
                            break
                    finally:
                        os.remove(filePath)
                        print("删除验证码")
                except NoSuchElementException:
                    traceback.print_exc()
                    print("【发票验真平台】出现定位不到标签的错误,可能是登陆状态丢失,重新启动浏览器,并保存截图")
                    ossUrl = self.get_full_screen_oss()
                    print("截图ossURL,并休息2秒", ossUrl)
                    self.driver.restartDriver()
                    # NOTE(review): the log line above says 2 seconds but the
                    # pause is 5 -- confirm which one is intended.
                    time.sleep(5)
                except Exception:
                    print("验证码接口请求异常", url)
                    traceback.print_exc()
            if pdfs:
                print("至少有一个有正确结果,随机取一个返回")
                ans = random.choice(pdfs)
                print("返回", ans)
                return ans
            # Switch to another captcha image and try again.
            self.click100('yzm_img')
    def _deal(self, input):
        """Verify one invoice on inv-veri.chinatax.gov.cn through the browser.

        ``input`` is a dict. The aliases ``invoiceCode``, ``invoiceNo``,
        ``date``, ``amount`` and ``checkCode`` are copied onto ``fpdm``,
        ``fphm``, ``kprq``, ``kjje`` and ``jym``. ``fpdm``, ``fphm``,
        ``kprq`` and ``kjje`` are mandatory.

        The method fills in the form, waits for the captcha, has it
        recognised by ``self.getCodeString()``, submits, and retries the
        whole flow on captcha errors, rate limiting and unsupported captcha
        kinds.

        Returns:
            dict: carries ``state`` and ``errMsg``:
              * 619 -- a mandatory parameter is missing (``input`` itself is
                updated and returned);
              * 601-610 -- the site rejected a field; a new dict with
                ``ossPathScreen`` is returned (rejections seen before the
                captcha step are also merged into ``input``);
              * 413 -- daily verification limit for this invoice exceeded;
              * 579 -- the result page could not be parsed; ``data`` holds
                the raw HTML and the traceback;
              * 299 -- default when ``self._parseInvoice`` returns no
                ``state``; ``input`` is updated with ``data`` and
                ``ossPathScreen`` and returned;
              * 599 -- unexpected exception; ``errMsg`` is the traceback.
        """
        # Accept the English aliases for the site's field names.
        aliases = (
            ('invoiceCode', 'fpdm'),
            ('invoiceNo', 'fphm'),
            ('date', 'kprq'),
            ('amount', 'kjje'),
            ('checkCode', 'jym'),
        )
        for alias, field in aliases:
            if alias in input:
                input[field] = input[alias]

        for key in ("fpdm", "fphm", "kprq", "kjje"):
            if key not in input:
                input.update({
                    'state':
                    619,
                    # Name the missing parameter in the message.
                    'errMsg':
                    "ERROR缺少必备参数{},fpdm、fphm、kprq、kjje、(校验码如果有必须填jym)为必须存在的参数"
                    .format(key)
                })
                return input

        # (page marker, log line, errMsg, state) checked while waiting for
        # the captcha element to show up.
        input_errors = (
            ("发票代码有误!", "发票代码有误",
             "ERROR发票代码有误!无法返回信息,请输入正确的数据或格式", 601),
            ("发票号码有误!", "发票号码有误",
             "ERROR发票号码有误!无法返回信息,请输入正确的数据或格式", 602),
            ("开票日期有误!", "开票日期有误",
             "ERROR开票日期有误!无法返回信息,请输入正确的数据或格式", 603),
            ("开票金额有误!", "开票金额有误",
             "ERROR开票金额有误!无法返回信息,请输入正确的数据或格式", 604),
            ("校验码有误!", "校验码有误!",
             "ERROR校验码有误!无法返回信息,请输入正确的数据或格式", 605),
            ("请输入发票号码", "请输入发票号码", "ERROR请输入发票号码!", 606),
            ("请输入发票代码", "请输入发票代码", "ERROR请输入发票代码", 607),
            ("请输入开具金额", "请输入开具金额", "ERROR请输入开具金额", 608),
            ("请输入开票日期", "请输入开票日期", "ERROR请输入开票日期", 609),
            ("请输入校验码", "请输入校验码", "ERROR请输入校验码", 610),
        )
        # Same layout, checked after the form has been submitted.
        verify_errors = (
            ("超过该张发票当日查验次数", "超过次数",
             "ERROR验真平台今日超过次数!无法返回信息", 413),
            ("发票代码有误!", "发票代码有误",
             "ERROR发票代码有误!无法返回信息,请输入正确的数据或格式", 601),
            ("发票号码有误!", "发票号码有误",
             "ERROR发票号码有误!无法返回信息,请输入正确的数据或格式", 602),
            ("开票日期有误!", "开票日期有误",
             "ERROR开票日期有误!无法返回信息,请输入正确的数据或格式", 603),
            ("开票金额有误!", "开票金额有误!",
             "ERROR开票金额有误!无法返回信息,请输入正确的数据或格式", 604),
            ("校验码有误!", "校验码有误",
             "校验码有误!无法返回信息,请输入校验码或查看是否正确", 605),
        )

        driver = self.driver

        def failure(message, state):
            # Build an error result and attach a screenshot of the page.
            result = {'errMsg': message, 'state': state}
            result['ossPathScreen'] = self.get_full_screen_oss()
            return result

        def fill(element_id, value):
            # The element is located again after clear(), as before.
            driver.find_element_by_id(element_id).clear()
            driver.find_element_by_id(element_id).send_keys(value)

        while True:
            try:
                dictNow = input
                try:
                    driver.get('https://inv-veri.chinatax.gov.cn/index.html')
                except TimeoutException:
                    print("driver超时异常,忽略并尝试提取内容")
                print("正在输入发票信息")
                fpdm = input['fpdm']
                fphm = input['fphm']
                kprq = input['kprq']
                kjje = input.get('kjje', '')
                # Only the last six characters of the check code are used.
                jym = input.get('jym', '000000')[-6:]
                for _ in range(100):
                    if 'fpdm' in driver.page_source:
                        break
                    time.sleep(0.1)
                fill('fpdm', fpdm)  # invoice code
                fill('fphm', fphm)  # invoice number
                fill('kprq', kprq)  # issue date
                time.sleep(0.1)
                # The fourth field takes the amount when the page labels it
                # as such, otherwise the check code.
                if "开具金额(不含税)" in driver.page_source:
                    fill('kjje', kjje)
                else:
                    fill('kjje', jym)

                # Wait for the captcha element, watching for input errors.
                for _ in range(100):
                    page = driver.page_source
                    for marker, log_text, message, state in input_errors:
                        if marker in page:
                            print(log_text)
                            dictNow = failure(message, state)
                            input.update(dictNow)
                            return dictNow
                    if 'yzm_img' in page:
                        self._state = "已经出现验证码标签"
                        print("出现验证码标签")
                        break
                    time.sleep(0.1)

                # Captcha handling.
                time.sleep(0.1)
                try:
                    for _ in range(50):
                        if '请输入验证码' in driver.page_source:
                            self._state = "已经成功渲染验证码"
                            print("成功渲染验证码")
                            break
                        time.sleep(0.1)
                    else:
                        print("5秒钟验证码还没有出现,刷新页面")
                        print("错误截图", self.get_full_screen_oss())
                        self.driver.refresh()
                        continue
                    if '请输入验证码文字' in driver.page_source:
                        # Captchas without a colour hint are skipped.
                        print("不打不带颜色的码,跳过")
                        continue
                    try:
                        # Raises when the coloured hint element is absent.
                        driver.find_element_by_id('yzminfo') \
                            .find_element_by_css_selector('font') \
                            .get_attribute('color')
                    except Exception:
                        print("不打不带颜色的码,跳过")
                        continue

                    imgCode1 = self.getCodeString()
                except Exception:
                    print("未知错误")
                    traceback.print_exc()
                    continue

                print("成功识别", imgCode1)
                fill('yzm', imgCode1)
                print("【验证】已经完成所有信息输入")
                time.sleep(0.1)
                ActionChains(self.driver).double_click(
                    driver.find_element_by_id('checkfp')).perform()
                print("正在进行查验")
                time.sleep(0.1)
                for _ in range(100):
                    if 'popup_ok' in driver.page_source or 'iframe' in driver.page_source:
                        break
                    time.sleep(0.1)

                page = driver.page_source
                for marker, log_text, message, state in verify_errors:
                    if marker in page:
                        print(log_text)
                        return failure(message, state)
                if '一分钟' in page:
                    print("访问过于频繁,休息60秒后再试")
                    self._state = "由于访问频率太快,系统强制占用进程休息60秒"
                    time.sleep(60)
                    continue
                if 'popup_ok' in page:
                    print("验证码输入错误,重试流程")
                    continue
                if 'iframe' in page:
                    print("验证成功,跳转Iframe")
                    driver.switch_to_frame(0)
                    print(driver.page_source)
                    filePath = 'fapiao/' + fpdm + '.png'
                    driver.save_screenshot(filePath)
                    ossPath = fileUpdate(filePath)

                    try:
                        invoiceData = self._parseInvoice(driver.page_source)
                    except Exception:
                        invoiceData = {
                            'errMsg':
                            "ERROR解析结构异常,发现新模板或者改版,请将这个返回交付开发者进行模板添加",
                            'html': driver.page_source,
                            'errMsgText': traceback.format_exc(),
                        }
                        dictNow.update({
                            'data': invoiceData,
                            'ossPathScreen': ossPath,
                            'errMsg':
                            "ERROR解析结构异常,发现新模板或者改版,请将这个返回交付开发者进行模板添加!",
                            'state': 579
                        })
                        # Return here so state 579 is not overwritten by the
                        # success defaults below.
                        return dictNow
                    dictNow.update({
                        'data': invoiceData,
                        'ossPathScreen': ossPath,
                        'errMsg': invoiceData.get('errMsg', 'maybe_success!'),
                        'state': invoiceData.get('state', 299)
                    })
                    return dictNow
                # No known marker on the page: run the whole flow again.
            except Exception:
                print("未知异常,进行上报")
                traceback.print_exc()
                return {'errMsg': traceback.format_exc(), 'state': 599}
Example #5
0
    def download_pdf(self, regno, companyName, pdfs):
        """Download every file listed for ``regno`` and record each result.

        Opens the file-list page for ``regno``, waits (up to 100 polls of
        0.1s) for the text '下载', then triggers a download for every ``<a>``
        element: the first through the page's ``download(...)`` script using
        the link text without ``.pdf``, the rest by clicking. Each expected
        file ``./webDriver_download/<link text>`` is then polled for up to
        100 x 0.1s. A file that appears is passed to ``fileUpdate``
        (presumably an OSS upload -- defined elsewhere in the file), deleted
        locally and recorded in ``self.db[tableName]``.

        Args:
            regno: registration number substituted into the URL.
            companyName: stored on every record.
            pdfs: list that result dicts are appended to (mutated in place).

        Returns:
            The same ``pdfs`` list. Each appended dict has ``regno``,
            ``pdfName``, ``companyName``, ``ossPath`` and ``insertTime``.
            Entries for files that never appeared have ``ossPath=None`` plus
            ``errMsg`` and ``state=703``. Unexpected errors are printed and
            end processing early.
        """
        url = "https://www.zhongdengwang.org.cn/rs/conditionquery/byid.do?method=viewfile&regno={}&type=1"
        driver = self.driver
        try:
            try:
                driver.get(url.format(regno))
            except TimeoutException:
                print("driver超时异常,忽略并尝试提取内容")
            for _ in range(100):
                if '下载' in driver.page_source:
                    break
                time.sleep(0.1)

            # Trigger all downloads first (changed 2019-04-01 to collect
            # every file instead of only the first).
            for index, aItem in enumerate(
                    driver.find_elements_by_css_selector("a")):
                name = aItem.text
                print("获取文件名", name)
                print(name, "尚未缓存,走下载上传路线")
                if index == 0:
                    # Initial registration, e.g.
                    # href="javascript:download('02973013000359528048');"
                    driver.execute_script("download('{}');".format(
                        name.replace('.pdf', '')))
                    time.sleep(0.5)
                else:
                    aItem.click()
                    time.sleep(0.1)

            # Then wait for each file and upload it.
            for aItem in driver.find_elements_by_css_selector("a"):
                name = aItem.text
                filePath = "./webDriver_download/" + name
                print("正在检查文件是否下载完成", filePath)
                for _ in range(100):
                    if not os.path.exists(filePath):
                        print("文件还在浏览器下载中,请稍后!")
                        time.sleep(0.1)  # 100 polls x 0.1s = 10s
                        continue

                    ossPath = fileUpdate(filePath)
                    # Remove the local copy once it has been uploaded.
                    os.remove(filePath)
                    record = {
                        'regno': regno,
                        'pdfName': name,
                        'companyName': companyName,
                        'ossPath': ossPath,
                        'insertTime': datetime.datetime.now(),
                    }
                    try:
                        self.db[tableName].insert(record)
                    except Exception:
                        # Caching is best-effort; keep processing the files.
                        print("WARNING数据库链接异常,将会导致缓存失败")
                        traceback.print_exc()
                    if '_id' in record:
                        record['_id'] = str(record['_id'])
                    pdfs.append(record)
                    break
                else:
                    # Decided per file, so a timeout is reported even after
                    # an earlier file has already produced a record.
                    print("10秒都没有下载成功,下载异常")
                    pdfs.append({
                        'regno': regno,
                        'pdfName': name,
                        'companyName': companyName,
                        'ossPath': None,
                        'insertTime': datetime.datetime.now(),
                        'errMsg': "ERROR 下载10秒都没有下载完,可能是中登网下载链接失效无法下载",  # 703
                        'state': 703,
                    })
        except Exception:
            traceback.print_exc()
            time.sleep(1)
        return pdfs