Example #1
0
    def parse_get_xj(self, response):
        """
        Collect the enrollment-status (学籍) records and move on to the
        degree page.

        For every ``div.clearfix`` block in the response, the record image
        (``img.xjxx-img``) is downloaded with the archive session cookie and
        handed to ``self.pic_orc``; each result is stored under the key
        produced by ``self._get_xjxl_key``. The collected mapping is placed
        in ``meta['xj_dict']`` and a request for ``self.gdjy_xl_url``
        (callback ``parse_get_xl``) is yielded. Any exception raised while
        doing so is reported through ``self.except_handle``.
        """

        meta = response.meta
        item = meta["item"]
        session_id = meta["archive_jsessionid"]
        try:
            records = {}
            for block in response.xpath('//div[@class="clearfix"]'):
                image_src = block.xpath(
                    './/img[@class="xjxx-img"]/@src').extract_first("")
                image_bytes = get_content_by_requests(
                    image_src,
                    headers=meta['headers'],
                    cookie_jar={'JSESSIONID': session_id})
                # pic_orc presumably OCRs the image into a dict -- confirm.
                record = self.pic_orc(image_bytes)
                records[self._get_xjxl_key(record)] = record

            meta['xj_dict'] = records
            next_request = Request(url=self.gdjy_xl_url,
                                   headers=meta['headers'],
                                   cookies={'JSESSIONID': session_id},
                                   callback=self.parse_get_xl,
                                   errback=self.err_callback,
                                   meta=meta,
                                   dont_filter=True)
            yield next_request
        except Exception:
            yield from self.except_handle(item["username"],
                                          "学信网---提取学籍信息数据解析异常")
Example #2
0
    def get_captcha_code(self, response):
        """
        Download the captcha image and recognise it, retrying until a
        4-character code is obtained.

        The image is fetched from the court site's ``captcha.do`` endpoint
        using headers derived from ``response`` and is passed to
        ``self._get_captcha_code``. The loop has no retry limit: it pauses
        for the configured ``DOWNLOAD_DELAY`` (default 0.3) between attempts
        and only returns once a code of length 4 is produced.

        :param response: response whose headers are reused for the download.
        :return: the recognised 4-character captcha code.
        """
        headers = get_headers_from_response(response)
        sleep_time = self.settings.get("DOWNLOAD_DELAY", 0.3)
        captcha_id = self.captcha_id
        _get_captcha_code = self._get_captcha_code
        while True:
            form_data = {
                "captchaId": captcha_id,
                "random": str(rand_0_1()),
            }
            try:
                captcha_body = get_content_by_requests(
                    "http://zhixing.court.gov.cn/search/captcha.do?" +
                    urlencode(form_data),
                    headers=headers)
            except Exception:
                # Download failed: wait and try again.
                sleep(sleep_time)
                continue

            if captcha_body.startswith(b"<"):
                # Markup instead of image bytes: the site answered with its
                # "enable JavaScript" page rather than the captcha.
                self.logger.error("被执行人---验证码:请开启JavaScript并刷新该页")
                # Use the same delay as the other retry paths; the previous
                # ``self.sleep_time`` lookup was inconsistent with them and
                # would raise AttributeError if the attribute is not defined.
                sleep(sleep_time)
                continue

            captcha_code = _get_captcha_code(captcha_body)
            if len(captcha_code) == 4:
                return captcha_code
            sleep(sleep_time)
Example #3
0
    def http_request(self, url, method="GET", data=None, headers=None, cookies=None,
                     to_json=False, get_str=True, charset="utf-8", get_cookies=False):
        """
        Perform an HTTP request and post-process the response body.

        :param url: target URL.
        :param method: ``"GET"`` or ``"POST"``; anything else is logged as
            unsupported and ``None`` is returned.
        :param data: request body, used for POST requests only.
        :param headers: request headers; falls back to ``self.headers``.
        :param cookies: cookies as a dict, or as a list of dicts carrying
            ``"name"`` / ``"value"`` keys; falls back to ``self.cookies``.
        :param to_json: parse the decoded text with ``json_loads``
            (only applies when ``get_str`` is true).
        :param get_str: decode the body with ``charset``; when false the
            raw content is returned as-is.
        :param charset: encoding used to decode the body.
        :param get_cookies: when true, return
            ``{"result": <payload>, "cookies": <response cookies>}``
            instead of the bare payload.
        :return: the payload (or the dict described above), or ``None`` when
            the method is unsupported or any exception occurred.
        """
        try:
            cookies_dic = {}
            if headers is None:
                headers = self.headers or {}
            if cookies is None:
                cookies = self.cookies or {}
            if isinstance(cookies, list):
                # Flatten a list of cookie records into a simple name -> value dict.
                cookies = {cookie["name"]: cookie["value"] for cookie in cookies}

            if method == "GET":
                if get_cookies:
                    resp = get_response_by_requests(url, headers=headers, cookie_jar=cookies)
                    cookies_dic = resp.cookies.get_dict()
                    content = resp.content
                else:
                    content = get_content_by_requests(url, headers=headers, cookie_jar=cookies)
            elif method == "POST":
                if get_cookies:
                    # Forward the body here as well; this branch used to drop
                    # ``data``, so the POST was sent without its payload.
                    resp = get_response_by_requests_post(url, data=data, headers=headers,
                                                         cookie_jar=cookies)
                    cookies_dic = resp.cookies.get_dict()
                    content = resp.content
                else:
                    content = get_content_by_requests_post(url, data=data, headers=headers,
                                                           cookie_jar=cookies)
            else:
                self.logger.error("暂不支持该请求方法")
                return None

            # Raw content -> decoded text -> parsed JSON, depending on the flags.
            if get_str:
                result = content.decode(charset)
                if to_json:
                    result = json_loads(result)
            else:
                result = content

            if get_cookies:
                return {"result": result, "cookies": cookies_dic}
            return result
        except Exception:
            self.logger.exception("请求出错: url:%s" % url)
            return None
 def send_pic_get_ponint6_to_ssdb(self, img_url, img_desc, username):
     """
     Download the image at ``img_url``, ask for its captcha answer and
     return the first six comma-separated fields of that answer.

     The image is passed to ``self.ask_image_captcha`` as a ``.jpeg`` file
     together with ``img_desc``. An ``IndexError`` is raised when the
     answer has fewer than six comma-separated parts.
     """
     image_bytes = get_content_by_requests(img_url, self.headers)
     answer = self.ask_image_captcha(image_bytes,
                                     username,
                                     file_type=".jpeg",
                                     image_describe=img_desc)
     parts = answer.split(',')
     return tuple(parts[index] for index in range(6))
Example #5
0
 def _get_callDetail_image_captcha(self, response, username):
     """
     Download a fresh call-detail captcha image from shop.10086.cn.

     Cookies and headers are taken from ``response``; the headers are also
     handed to ``self.set_image_captcha_headers_to_ssdb`` for ``username``
     before the download. Returns whatever ``get_content_by_requests``
     returns for the image URL.
     """
     session_cookies = get_cookiejar_from_response(response)
     request_headers = get_headers_from_response(response)
     # The random suffix makes every image URL distinct.
     image_url = "http://shop.10086.cn/i/authImg?t=%s" % str(rand_0_1())
     self.set_image_captcha_headers_to_ssdb(request_headers, username)
     image = get_content_by_requests(image_url,
                                     headers=request_headers,
                                     cookie_jar=session_cookies)
     return image
Example #6
0
    def user_login(self, response):
        """
        Handle the JSON answer of the sina login request.

        Depending on ``retcode``:

        * ``'101'``, ``'2070'``, ``'2079'`` -- wrong user name or password;
          reported through ``self.error_handle``.
        * ``'4049'`` -- a captcha is required (login from another location):
          the captcha image is downloaded, solved through
          ``self.ask_image_captcha``, stored in ``meta["captecha_code"]``,
          and the pre-login request is issued again (callback ``step1``).
        * anything else -- treated as success; the cross-domain URLs from
          the answer are followed (callback ``cross_domain_one``).

        Any exception is reported through ``self.except_handle`` together
        with a logout request.
        """
        meta = response.meta
        item = meta['item']
        username = item['username']
        try:
            login_result = json_loads(response.text)
            code = login_result['retcode']
            if code in ('101', '2070', '2079'):
                err_message = '登录名或密码错误!'
                log_message = "sina---登录失败:(username:%s, password:%s) %s" % (
                    username, item['password'], err_message)
                yield from self.error_handle(username,
                                             msg=log_message,
                                             tell_msg=err_message)
            elif code == '4049':
                # Publish the captcha headers and URL before asking for the answer.
                headers = get_headers_from_response(response)
                captcha_url = meta['captcha_url']
                # Hand the header information over to django.
                self.set_image_captcha_headers_to_ssdb(headers, username)
                self.set_email_img_url_to_ssdb(captcha_url, username)

                image = get_content_by_requests(captcha_url, headers=headers)
                answer = self.ask_image_captcha(image,
                                                username,
                                                file_type=".png")
                meta["captecha_code"] = {'door': answer, 'pcid': meta['pcid']}

                # Login from another location: restart the pre-login step,
                # this time carrying the captcha answer in meta.
                su = self._enb64(self._url_encode(username)).decode()
                step1_url = ('http://login.sina.com.cn/sso/prelogin.php?entry=sso'
                             '&callback=sinaSSOController.preloginCallBack'
                             '&su=%s&rsakt=mod&client=ssologin.js(v1.4.19)') % su
                yield Request(url=step1_url,
                              callback=self.step1,
                              errback=self.err_callback,
                              meta=meta,
                              dont_filter=True)
            else:
                cross_urls = login_result['crossDomainUrlList']
                meta["cross"] = cross_urls[0]
                yield Request(url=cross_urls[1],
                              callback=self.cross_domain_one,
                              errback=self.err_callback,
                              meta=meta,
                              dont_filter=True)
        except Exception:
            yield from self.except_handle(
                username,
                msg="用户登录解析异常",
                tell_msg="邮箱登录失败,请刷新重试",
                logout_request=self.get_logout_request(meta))
Example #7
0
 def download_pic(self, url):
     """
     Download a captcha image.

     :param url: address of the image.
     :return: whatever ``get_content_by_requests`` returns for ``url``,
         or ``None`` when the download raised (the failure is logged).
     """
     try:
         picture = get_content_by_requests(url, headers=self.headers)
     except Exception:
         self.logger.exception("下载验证码图片失败")
         picture = None
     return picture
Example #8
0
 def _get_captcha_code_by_phantomJS(self, driver, username):
     """
     Download the Ping An captcha image with the browser's cookies and
     return the answer obtained from ``self.ask_image_captcha``.

     :param driver: webdriver whose cookies are reused for the download.
     :param username: user the captcha is asked for.
     """
     # NOTE(review): the query string contains a literal "&amp;" (an HTML
     # entity) rather than "&" -- kept unchanged here; confirm whether the
     # server really expects it.
     captcha_url = "http://m.pingan.com/t/ImageGif.do?v=2&rd=%s&amp;imageNum=359" % random()
     browser_cookies = get_cookies_dict_from_webdriver(driver)
     image = get_content_by_requests(captcha_url,
                                     self.headers,
                                     cookie_jar=browser_cookies)
     return self.ask_image_captcha(image, username, file_type=".jpeg")
Example #9
0
    def parse(self, response):
        """
        Entry point: submit the chsi.com.cn login form.

        A copy of ``self.headers`` is stored in ``meta['headers']``, the
        captcha retry budget is set to 5 and ``item['xueli']`` is reset.
        A 302 response is passed straight to ``self.parse_login``.
        Otherwise the ``lt`` form token is read from the page, a captcha is
        fetched and solved when the page shows a captcha input (or a failed
        input), and the login form is posted to ``self._start_url_``.
        Any exception is reported through ``self.except_handle``.
        """
        headers = self.headers.copy()
        meta = response.meta
        meta['headers'] = headers
        meta['captcha_retry_time'] = 5
        item = meta["item"]
        item['xueli'] = []

        if response.status == 302:
            yield from self.parse_login(response)
            return

        try:
            self.logger.info("请求登录接口->%s" % self.user_login)
            lt = response.xpath('//input[@name="lt"]/@value').extract_first("")
            captcha_code = None
            captcha_required = (
                response.xpath('//input[@id="captcha"]').extract_first()
                or response.xpath('//div[@class="ct_input errors"]').extract_first())
            if captcha_required:
                meta['captcha_retry_time'] -= 1
                cookiejar = get_cookiejar_from_response(response)
                captcha_url = ("https://account.chsi.com.cn/passport/captcha.image?id="
                               + str(random()))
                # The captcha is fetched through the same proxy as the page.
                proxies = {
                    "https": response.meta['proxy'],
                    "http": response.meta['proxy'],
                }
                image = get_content_by_requests(captcha_url,
                                                headers,
                                                cookie_jar=cookiejar,
                                                proxies=proxies)
                captcha_code = self.ask_image_captcha(image,
                                                      item['username'],
                                                      file_type=".jpeg")

            req_data = self.get_req_data(self.user_login,
                                         user_name=item["username"],
                                         password=item["password"],
                                         lt=lt,
                                         captcha=captcha_code)
            self.logger.debug(req_data)
            # ``headers`` is the same dict object stored in meta['headers'],
            # so these two entries are visible to later callbacks as well.
            headers['Cookie'] = response.headers.get('Set-Cookie').decode()
            headers['Referer'] = self._start_url_
            login_request = FormRequest(url=self._start_url_,
                                        headers=headers,
                                        formdata=req_data,
                                        callback=self.parse_login,
                                        errback=self.err_callback,
                                        meta=meta,
                                        dont_filter=True)
            yield login_request
        except Exception:
            yield from self.except_handle(item["username"],
                                          "学信网---爬虫解析入口异常")
Example #10
0
 def _verify_callDetail_captcha(self, response, username, captcha_code):
     """
     Check a call-detail captcha answer against shop.10086.cn.

     :return: ``True`` when the reply contains ``"retCode":"000000"``,
         otherwise ``False``.
     """
     session_cookies = get_cookiejar_from_response(response)
     request_headers = get_headers_from_response(response)
     endpoint = "http://shop.10086.cn/i/v1/res/precheck/" + username
     check_url = endpoint + "?captchaVal=" + captcha_code + "&_=" + get_js_time()
     reply = get_content_by_requests(check_url,
                                     headers=request_headers,
                                     cookie_jar=session_cookies)
     verified = b'"retCode":"000000"' in reply
     return verified
Example #11
0
 def verify_captcha(self, response, captcha_code):
     """
     Check a captcha answer before submitting the login form.

     China Mobile exposes a URL that validates the captcha without a form
     submission. The code is sent in ``unicode-escape`` form with every
     ``\\u`` marker stripped.

     :return: ``True`` when the reply contains ``"resultCode":"0"``,
         otherwise ``False``.
     """
     session_cookies = get_cookiejar_from_response(response)
     request_headers = get_headers_from_response(response)
     escaped_code = captcha_code.encode('unicode-escape', "ignore")
     check_url = (b"https://login.10086.cn/verifyCaptcha?inputCode="
                  + escaped_code.replace(b"\\u", b""))
     reply = get_content_by_requests(check_url,
                                     headers=request_headers,
                                     cookie_jar=session_cookies)
     verified = b'"resultCode":"0"' in reply
     return verified
Example #12
0
    def parse_get_xl(self, response):
        """
        Collect the degree (学历) records and finish the crawl.

        For every ``div.clearfix`` block the graduation photo (when one is
        present and is not the ``no-photo`` placeholder) and the record
        image are downloaded with the archive session cookie. The record
        image is passed through ``self.pic_orc``; the photo bytes are stored
        under ``'photo'``. A record whose key also exists in
        ``meta['xj_dict']`` is merged with that enrollment record via
        ``self.merge_dict``. Enrollment records left unmatched are appended
        at the end, then ``self.crawling_done`` is invoked. Any exception
        inside the processing is reported through ``self.except_handle``.
        """
        meta = response.meta
        item = meta["item"]
        pending_xj = meta['xj_dict']
        try:
            records = item['xueli']
            session_id = meta["archive_jsessionid"]
            for block in response.xpath('//div[@class="clearfix"]'):
                # Link to the graduation-certificate photo of this record.
                photo_src = block.xpath(
                    './/div[@class="pic"]/img/@src').extract_first()
                photo = b''
                if photo_src and 'no-photo' not in photo_src:
                    photo = get_content_by_requests(
                        'https://my.chsi.com.cn' + photo_src,
                        headers=meta['headers'],
                        cookie_jar={'JSESSIONID': session_id})

                info_src = block.xpath(
                    './/img[@class="xjxx-img"]/@src').extract_first("")
                info_bytes = get_content_by_requests(
                    info_src,
                    headers=meta['headers'],
                    cookie_jar={'JSESSIONID': session_id})
                record = self.pic_orc(info_bytes)
                record['photo'] = photo

                key = self._get_xjxl_key(record)
                if key in pending_xj:
                    record = self.merge_dict(pending_xj.pop(key), record)
                records.append(record)

            # Whatever is still pending had no matching degree record.
            records.extend(pending_xj.values())

            yield from self.crawling_done(item)
        except Exception:
            yield from self.except_handle(item["username"],
                                          "学信网---提取学历信息数据解析异常")
Example #13
0
    def parse(self, response):
        """
        Log in to Ping An Bank through a webdriver and collect the account
        data.

        The login page is opened, an image captcha is solved when the
        captcha input is displayed, and the credentials are filled in and
        submitted via JavaScript. If the ``safe_logout`` element does not
        appear within 6 seconds the page is inspected for a password or
        captcha error and the matching message is reported; otherwise
        ``self._login_success`` supplies the balance and trade records.
        The webdriver is always closed at the end.
        """
        item = response.meta["item"]
        username = item["username"]
        password = item["password"]

        login_page = self.login_url + get_js_time() + "?returnURL=account%2Findex%2FtransferList"
        driver = self.load_page_by_webdriver(login_page, "//div[@id='pwdObject1-btn-pan']")
        try:
            # Image captcha: a failure here is only logged, except for a
            # captcha timeout, which aborts the login.
            try:
                if driver.find_element_by_id("verifyCode").is_displayed():
                    captcha_url = ("https://bank.pingan.com.cn/ibp/portal/pc/getVcode2.do?"
                                   + get_js_time())
                    browser_cookies = get_cookies_dict_from_webdriver(driver)
                    image = get_content_by_requests(captcha_url,
                                                    headers=self.headers,
                                                    cookie_jar=browser_cookies)
                    answer = self.ask_image_captcha(image, username)
                    driver.execute_script(
                        'document.getElementById("verifyCode").value="' + answer + '";')
            except CaptchaTimeout:
                raise
            except Exception:
                self.logger.exception("平安银行---图片验证码")

            # Fill in user name and password, then submit.
            submit_script = (
                'document.getElementById("pwdObject1-btn-pan").click();'
                'document.getElementById("userName").value="%s";'
                'document.getElementById("pwdObject1-input").value="%s";'
                'document.getElementById("login_btn").click();'
            ) % (username, password)
            driver.execute_script(submit_script)

            try:
                self.wait_xpath(driver, "//li[@id='safe_logout']", raise_timeout=True, timeout=6)
            except TimeoutException:
                page_source = driver.page_source
                # Page marker -> element holding the message shown to the user.
                known_errors = (
                    ("密码错", "//span[@id='errorLoginMsg']"),
                    ("证码错", "//span[@id='verifyError']"),
                )
                for marker, message_xpath in known_errors:
                    if marker in page_source:
                        yield from self.error_handle(
                            username, "平安银行---登录",
                            driver.find_element_by_xpath(message_xpath).text)
                        break
                else:
                    yield from self.error_handle(
                        username, "平安银行---登录异常(%s)" % page_source, "登录失败")
            else:
                item["balance"], item["trade_records"] = self._login_success(driver)
                yield from self.crawling_done(item)
        except CaptchaTimeout:
            yield from self.error_handle(username, "平安银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(username, "平安银行---爬取", "爬取异常")
        finally:
            driver.quit()
Example #14
0
 def check_need_sms_captcha(self, response, username):
     """
     Ask login.10086.cn whether ``username`` needs a verification code.

     :return: ``True`` when the reply contains ``"needVerifyCode":"1"``,
         otherwise ``False``.
     """
     session_cookies = get_cookiejar_from_response(response)
     request_headers = get_headers_from_response(response)
     # A "pwdType": "02" entry was disabled in the original query and is
     # still not sent.
     query = urlencode({
         "accountType": "01",
         "account": username,
         "timestamp": get_js_time(),
     })
     reply = get_content_by_requests(
         "https://login.10086.cn/needVerifyCode.htm?" + query,
         headers=request_headers,
         cookie_jar=session_cookies)
     needed = b'"needVerifyCode":"1"' in reply
     return needed
Example #15
0
 def verify_captcha(self, response, captcha_code, cookies_dict):
     """
     Check a captcha answer before submitting the login form.

     China Unicom exposes a URL that validates the captcha without a form
     submission. The request mimics a jQuery JSONP call: ``callback`` is
     ``jQuery1720`` followed by a random 17-digit number, ``_`` and the
     current JS timestamp.

     :param response: response whose headers are reused for the request.
     :param captcha_code: the answer to verify.
     :param cookies_dict: cookies to send with the request.
     :return: ``True`` when the reply contains ``"resultCode":"true"``,
         otherwise ``False``.
     """
     headers = get_headers_from_response(response)
     the_time = get_js_time()
     # randint() requires real integers: float bounds such as 1E16 raise
     # TypeError on Python 3.12+, and ``1E17 - 1`` cannot be represented
     # exactly as a float anyway (it rounds back to 1e17).
     random_suffix = randint(10 ** 16, 10 ** 17 - 1)
     form_data = {
         'verifyCode': captcha_code,
         'verifyType': "1",
         '_': int(the_time) + 1,
         'callback': "jQuery1720" + str(random_suffix) + "_" + the_time,
     }
     url = "https://uac.10010.com/portal/Service/CtaIdyChk?" + urlencode(
         form_data)
     info = get_content_by_requests(url,
                                    headers=headers,
                                    cookie_jar=cookies_dict)
     return b'"resultCode":"true"' in info
Example #16
0
    def parse(self, response):
        """
        Log in to CITIC Bank through a webdriver and request the balance page.

        Steps, in order: type the credentials, solve the image captcha when
        it is displayed, submit the form, report a login error when one is
        shown, skip the "change password" prompt, handle the SMS
        verification step when it appears, extract ``EMP_SID`` and finally
        yield a ``FormRequest`` for the account balance (callback
        ``parse_balance``). Every failure path goes through
        ``self.error_handle`` / ``self.except_handle``; the webdriver is
        always closed at the end.
        """
        meta = response.meta
        item = meta["item"]
        username = item['username']

        driver = self.load_page_by_webdriver(response.url)
        # Process id of the IE driver, handed to Ddxoft below.
        pid = driver.iedriver.process.pid
        try:
            sleep(3)
            # The user name is set via JavaScript; the password is entered
            # key by key through Ddxoft (presumably OS-level keyboard
            # input -- confirm against the Ddxoft helper).
            with Ddxoft(pid) as dd_opt:
                driver.execute_script(
                    'document.getElementsByName("logonNoCert")[0].value="{0}";'
                    'document.getElementsByName("logonNoCert")[0].focus()'.
                    format(username))
                dd_opt.dd_tab()
                sleep(0.1)
                for i in item['password']:
                    dd_opt.dd_keyboard(i)
                    sleep(0.1)

            # Image captcha, only when the captcha area is displayed.
            if driver.find_element_by_id('verifyId').is_displayed():
                captcha_input = driver.find_element_by_name('verifyCode')
                captcha_url = driver.find_element_by_id(
                    'pinImg').get_attribute('src')
                capcha_cookies = get_cookies_dict_from_webdriver(driver)
                capcha_body = get_content_by_requests(
                    captcha_url,
                    headers=self.headers,
                    cookie_jar=capcha_cookies)
                captcha_code = self.ask_image_captcha(capcha_body, username)
                captcha_input.send_keys(captcha_code)
                sleep(0.5)
                # Dismiss the alert that reports whether the captcha was right.
                try:
                    driver.switch_to.alert.accept()
                except Exception:
                    # Best effort: no alert to accept.
                    pass
                # A 'stop_button' image in verifyImg marks a wrong captcha.
                if 'stop_button' in driver.find_element_by_id(
                        'verifyImg').get_attribute('src'):
                    yield from self.error_handle(
                        username,
                        "中信银行---登录失败:(username:%s, password:%s) %s" %
                        (username, item["password"], '验证码输入错误'),
                        tell_msg='验证码错误')
                    return

            # Submit by running the login button's onclick handler directly.
            butt_submit = driver.find_element_by_id('logonButton')
            butt_submit_onclick_js = butt_submit.get_attribute('onclick')
            driver.execute_script(butt_submit_onclick_js)
            # butt_submit.click()
            sleep(2)
            # An 'errorReason' element means the login was rejected.
            try:
                errorReason = driver.find_element_by_class_name('errorReason')
                message = errorReason.text
                yield from self.error_handle(
                    username,
                    "中信银行---登录失败:(username:%s, password:%s) %s" %
                    (username, item["password"], message),
                    tell_msg=message)
                return
            except NoSuchElementException:
                pass

            # if driver.find_element_by_id('firstLogonMdyPwdID').is_displayed():  # this element means the password must be reset
            #     # Only seen once, appearance no longer known; noted here for now.
            #     pass
            # The bare string below is the original marker: skip the
            # password-change prompt.
            ''' 跳过修改密码'''
            if driver.find_elements_by_xpath('//a[contains(text(),"跳过")]'):
                jump_js = (driver.find_element_by_xpath(
                    '//a[contains(@onclick,"jumpTip")]'
                ).get_attribute('onclick') or driver.find_element_by_xpath(
                    '//a[contains(text(),"跳过")]').get_attribute('onclick'))
                driver.execute_script(jump_js)
                if driver.find_element_by_id('jumpTipDiv').is_displayed():
                    jump_ok_js = driver.find_element_by_id(
                        'jump').get_attribute('onclick')
                    driver.execute_script(jump_ok_js)
            # The bare string below is the original marker: SMS verification
            # (it stopped appearing, so this path is not fully tested).
            ''' 短信验证  (突然又不出现了暂未测完全)'''
            if driver.find_elements_by_name('mdpBtn'):
                get_sms_code_js = driver.find_element_by_name(
                    'mdpBtn').get_attribute('onclick').replace(
                        'javascript:', '')
                # check_ok_js = driver.find_element_by_id('checkId').get_attribute('onclick').replace('javascript:', '')
                btn_next_js = driver.find_element_by_id(
                    'nextStep').get_attribute('onclick').replace(
                        'javascript:', '')
                # Click the checkbox until it is selected, at most 10 times.
                max_loop_time = 10
                while not driver.find_element_by_id(
                        'checkId').is_selected() and max_loop_time > 0:
                    driver.find_element_by_id('checkId').click()
                    max_loop_time -= 1
                driver.execute_script(get_sms_code_js)
                sms_code = self.ask_sms_captcha(username)
                driver.execute_script(
                    'document.getElementsByName("mobilDynPwdStr1")[0].value="{0}";'
                    .format(sms_code))
                sleep(1)

                driver.execute_script(btn_next_js)
                # A //dl/dd/b element carries the SMS error message.
                if driver.find_elements_by_xpath('//dl/dd/b'):
                    err_message = driver.find_element_by_xpath(
                        '//dl/dd/b').text
                    yield from self.error_handle(username,
                                                 "中信银行---短信验证码出错:%s " %
                                                 err_message,
                                                 tell_msg=err_message)
                    return

            self.wait_xpath(driver, '//input[@id="searchItemId"]')
            # First try to read EMP_SID from the logout form's action URL.
            EMP_SID = self.EMP_SID_pattern.search(
                driver.find_element_by_id('formLogout').get_attribute(
                    'action'))
            if EMP_SID:
                EMP_SID = EMP_SID.group(1)
            else:
                # Fallback: search for "账户查询" and read EMP_SID from the
                # page source after switching into the 'mainframe' frame.
                driver.execute_script(
                    'document.getElementById("searchItemId").value="账户查询";')
                driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
                driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
                driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
                sleep(1)
                mainframe = driver.find_element_by_id('mainframe')
                driver.switch_to.frame(mainframe)
                try:
                    EMP_SID = self.EMP_SID_pattern.search(
                        driver.page_source).group(1)
                except AttributeError:
                    # search() returned None: the pattern was not found.
                    yield from self.error_handle(username,
                                                 "中信银行---解析失败:EMP_SID 获取失败",
                                                 tell_msg='解析出错!')
                    return
            meta["EMP_SID"] = EMP_SID
            # Carry the browser session over to the follow-up request.
            cookies = get_cookies_dict_from_webdriver(driver)
            yield FormRequest(url=self.account_balance_url.format(EMP_SID),
                              callback=self.parse_balance,
                              headers=self.headers,
                              cookies=cookies,
                              meta=meta,
                              dont_filter=True,
                              errback=self.err_callback)
        except CaptchaTimeout:
            yield from self.error_handle(username,
                                         "中信银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(username,
                                          "中信银行---解析失败",
                                          tell_msg="解析失败")
        finally:
            driver.quit()
Example #17
0
 def _get_pic_by_requests(self, driver, url):
     """
     Download ``url`` with ``self.headers`` and the cookies currently held
     by ``driver``; returns what ``get_content_by_requests`` returns.
     """
     browser_cookies = get_cookies_dict_from_webdriver(driver)
     picture = get_content_by_requests(url,
                                       self.headers,
                                       cookie_jar=browser_cookies)
     return picture
Example #18
0
    def parse(self, response):
        """
        Log in to Minsheng Bank (CMBC) through an IE webdriver and collect
        the balance and trade records.

        Steps, in order: type the credentials, solve the image captcha when
        its input is displayed, submit, verify that the browser reached
        ``main.html``, read the balance, open the transaction page and
        query four consecutive three-month windows going back from today.
        Failures go through ``self.error_handle`` / ``self.except_handle``;
        the webdriver is always closed at the end.
        """
        item = response.meta["item"]
        username = item["username"]

        driver = self.getdriver(
            executable_path=self.settings["IE_EXECUTABLE_233_PATH"],
            browser_type="IE")
        # Process id of the IE driver, handed to Ddxoft below.
        pid = driver.iedriver.process.pid
        try:
            wait = ui.WebDriverWait(driver, 20)
            driver.get(response.url)
            wait.until(lambda dr: dr.find_element_by_id("loginButton"))
            driver.maximize_window()

            # The user name is set via JavaScript; the password is entered
            # key by key through Ddxoft (presumably OS-level keyboard
            # input -- confirm against the Ddxoft helper).
            with Ddxoft(pid) as visual_keyboard:
                driver.execute_script(
                    "username_input=document.getElementById('writeUserId');"
                    "username_input.value='%s';"
                    "username_input.focus();" % username)
                visual_keyboard.dd_tab()
                sleep(0.1)
                for key in item["password"]:
                    visual_keyboard.dd_keyboard(key)
                    sleep(0.1)

            # Check whether a captcha has to be entered.
            captcha_input = driver.find_element_by_id("_vTokenName")
            if captcha_input and captcha_input.is_displayed():
                # Captcha: download a fresh image with the browser's cookies.
                captcha_url = "https://nper.cmbc.com.cn/pweb/GenTokenImg.do?random=" + str(
                    random())
                capcha_cookies = get_cookies_dict_from_webdriver(driver)
                capcha_body = get_content_by_requests(
                    captcha_url,
                    headers=self.headers,
                    cookie_jar=capcha_cookies)
                captcha_code = self.ask_image_captcha(capcha_body, username)
                driver.execute_script(
                    "document.getElementById('_vTokenName').value='" +
                    captcha_code + "';")

            driver.execute_script(
                "document.getElementById('loginButton').click();")
            sleep(2)
            curr_url = driver.current_url
            if 'main.html' not in curr_url:  # check whether the login succeeded
                # The log message always says "wrong account/password";
                # the user is told the text of the jsonError element.
                error = driver.find_element_by_id('jsonError').text
                yield from self.error_handle(
                    username,
                    msg="民生银行---登录失败:(username:%s, password:%s) %s" %
                    (username, item['password'], '账号密码错误'),
                    tell_msg=error)
                return
            # wait.until(lambda dr: dr.find_element_by_id('welcomeMainContent'))
            wait.until(lambda dr: dr.find_elements_by_xpath(
                "//form[@id='welcomeMainContent']"
                "//a[text()='查询明细']"))
            # balance = driver.find_element_by_xpath('//div[@class="sy_m1_x v-scope"]/div/span').text
            item['balance'] = driver.find_element_by_xpath(
                '//div[@class="sy_m1_x v-scope"]'
                '/div/span[@class="v-binding"]').text
            # Jump to the bank-statement page.
            driver.execute_script(
                'document.getElementById("welcomeMainContent")'
                '.getElementsByClassName("yuanbj")[2].click()')
            wait.until(
                lambda dr: dr.find_element_by_id("QuickTitle").is_displayed())
            end_date = date.today()
            trade_records = item["trade_records"]
            for i in range(4):  # queries are limited to a 3-month span
                start_date = self.get_theday_3_month_ago(end_date)
                try:
                    wait.until(lambda dr: dr.find_elements_by_xpath(
                        "//input[@v-model='BeginDate']"))

                    # Choose the start date in the date-picker widget.
                    driver.execute_script(
                        '$("input[v-model=BeginDate]").focus();')
                    sleep(0.1)
                    driver.execute_script(
                        '$("select[data-handler=selectYear]").val("%s");'
                        '$("select[data-handler=selectYear]").change();' %
                        start_date.year)
                    sleep(0.1)
                    # The month select is given month - 1 (presumably the
                    # widget numbers months from 0 -- confirm).
                    driver.execute_script(
                        '$("select[data-handler=selectMonth]").val("%s");'
                        '$("select[data-handler=selectMonth]").change();' %
                        (start_date.month - 1))
                    sleep(0.1)
                    day = start_date.day
                    # Click the calendar cell whose text equals the day and
                    # which is not marked "ui-priority-secondary".
                    driver.execute_script('''
                    $("table.ui-datepicker-calendar").find("a:contains(%s)").each(function(){
                        if ($(this).text() == "%s" 
                            && $(this).attr("class").indexOf("ui-priority-secondary") == -1) {
                            this.click();
                        }
                    });''' % (day, day))

                    sleep(1)
                    # Only parse the table when the error box is hidden.
                    if 'display: none' in driver.find_element_by_id(
                            "jsonErrorShow").get_attribute("style"):
                        wait.until(lambda dr: dr.find_element_by_id(
                            "DataTable").is_displayed())
                        # json_data = {"AcNo":"6226220681208897","BankAcType":"03","BeginDate":start_date.strftime("%Y-%m-%d"),"EndDate":end_date.strftime("%Y-%m-%d"),"AcName":u"文学","Remark":"-","Fee":"0.00","FeeRemark":"-","Ten":"Ten","SubAcSeq":"0001","currentIndex":0,"uri":"/pweb/ActTrsQry.do"}
                        # _cookies = get_cookies_dict_from_webdriver(driver)
                        # response = requests.post("https://nper.cmbc.com.cn/pweb/ActTrsQry.do", headers=self.headers,
                        #                          json=json_data, cookies=_cookies, verify=False)
                        # response_text = response.text
                        trade_records.extend(
                            self.parse_item_from_webpage(driver, wait))
                except Exception:
                    # A failing window is logged and skipped; the loop
                    # continues with the next, older window.
                    self.logger.exception("民生银行---时间筛选")
                end_date = start_date

            yield from self.crawling_done(item)
        except CaptchaTimeout:
            yield from self.error_handle(username,
                                         "民生银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(username,
                                          msg="民生银行---错误信息:",
                                          tell_msg="银行流水数据爬取失败,请刷新页面重试!")
        finally:
            driver.quit()
Example #19
0
    def parse(self, response):
        """Log in through a webdriver and collect the account's trade records.

        Fills the login form in the browser, solves the image captcha when the
        captcha input is present, opens the detail ("mingxi") view in a second
        window and queries the range ``today - self.date_delta`` .. ``today``.
        The webdriver is always quit before the generator finishes.

        Args:
            response: Response whose ``meta["item"]`` provides ``username``
                and ``password``; ``response.url`` is loaded in the webdriver.

        Yields:
            The results of ``self.crawling_done(item)`` on success, otherwise
            those of ``self.error_handle`` (captcha timeout) or
            ``self.except_handle`` (any other exception).
        """
        item = response.meta["item"]
        username = item["username"]

        driver = self.load_page_by_webdriver(response.url,
                                             "//input[@id='USERID']")
        try:
            # The password control invokes JavaScript encryption on input, so
            # it has to be filled with send_keys.
            driver.find_element_by_id("LOGPASS").send_keys(item["password"])
            # The username is handed over as a script argument rather than
            # spliced into the JavaScript source, so quotes or backslashes in
            # it cannot break or alter the script.
            driver.execute_script(
                'document.getElementById("LOGPASS").blur();'
                'user_input=document.getElementById("USERID");'
                'user_input.focus();'
                'user_input.value=arguments[0];', username)

            # Captcha: only handled when the captcha input exists on the page.
            try:
                driver.find_element_by_id("PT_CONFIRM_PWD")
            except NoSuchElementException:
                pass
            else:
                captcha_url = driver.find_element_by_id(
                    "fujiama").get_attribute("src")
                cookiejar = get_cookies_dict_from_webdriver(driver)
                captcha_body = get_content_by_requests(captcha_url,
                                                       headers=self.headers,
                                                       cookie_jar=cookiejar)
                captcha_code = self.ask_image_captcha(captcha_body, username)
                # Same reasoning as for the username: pass the value as an
                # argument instead of concatenating it into the script.
                driver.execute_script(
                    'document.getElementById("PT_CONFIRM_PWD").value='
                    'arguments[0];', captcha_code)

            driver.execute_script(
                'document.getElementById("loginButton").click();')
            # ============================ end of login ============================ #

            # Click the "detail" entry, which lives inside an iframe.
            iframe_xpath = "//div[@id='w3']/iframe"
            self.wait_xpath(driver, iframe_xpath)
            driver.switch_to.frame(driver.find_element_by_xpath(iframe_xpath))
            self.wait_xpath(driver, "//span[@data_id='mingxi']")

            # Read the "values" attribute of the detail span from the card
            # list markup; it identifies which span to click below.
            html = find_str_range(driver.page_source, '<div class="card_list"',
                                  "/ul>")
            bs_obj = BeautifulSoup(html, "lxml")
            mingxi_values = bs_obj.find("span",
                                        {"data_id": "mingxi"})["values"]

            driver.execute_script("""$("span[values='%s']")[0].click()""" %
                                  mingxi_values)
            # The click opens a second window; continue inside its "sear" frame.
            driver.switch_to.window(driver.window_handles[1])
            driver.switch_to.frame("sear")

            end_date = date.today()
            start_date_str = (end_date - self.date_delta).strftime("%Y%m%d")
            end_date_str = end_date.strftime("%Y%m%d")
            # Both values are generated locally in %Y%m%d form, so formatting
            # them into the script is safe.
            driver.execute_script(
                'document.getElementById("START_DATE").value="%s";'
                'document.getElementById("END_DATE").value="%s";'
                'toSqOrQd("1");' % (start_date_str, end_date_str))
            self.wait_xpath(driver, "//iframe[@id='result']")

            item["trade_records"] = self._download_trade_records(
                driver, start_date_str, end_date_str, mingxi_values)

            yield from self.crawling_done(item)
        except CaptchaTimeout:
            yield from self.error_handle(username,
                                         "建设银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(username,
                                          "建设银行----爬取异常:",
                                          tell_msg="爬取建设银行账户流水失败")
        finally:
            driver.quit()
Example #20
0
    def parse(self, response):
        """Log in with simulated keystrokes, then request the account token.

        The credentials are typed through ``Ddxoft`` keyboard simulation bound
        to the IE driver process. An optional image captcha ("extra code") and
        an optional SMS verification step are handled when their elements are
        present. After login, the browser cookies and the ``ClientNo`` field
        are forwarded in a ``FormRequest`` to ``self.apply_token_url`` with
        ``self.parse_token`` as the callback. The webdriver is always quit
        before the generator finishes.

        Args:
            response: Response whose ``meta["item"]`` provides ``username``
                and ``password``; ``response.url`` is loaded in the webdriver.

        Yields:
            The token ``FormRequest`` on success, otherwise the results of
            ``self.error_handle`` (login failure, captcha timeout) or
            ``self.except_handle`` (any other exception).
        """
        meta = response.meta
        item = meta["item"]
        username = item['username']

        driver = self.load_page_by_webdriver(response.url)
        try:
            # Looked up inside the try so that the finally clause still quits
            # the browser if the driver process cannot be inspected.
            pid = driver.iedriver.process.pid
            # Pause 2 seconds before simulating keyboard input.
            sleep(2)
            with Ddxoft(pid) as dd_opt:
                for char in username:
                    dd_opt.dd_keyboard(char)
                sleep(0.5)
                dd_opt.dd_tab()
                sleep(0.5)
                for char in item['password']:
                    dd_opt.dd_keyboard(char)

            butt_submit = driver.find_element_by_id('LoginBtn')
            # Run the button's onclick handler directly instead of clicking it.
            butt_submit_onclick_js = butt_submit.get_attribute(
                'onclick').replace('javascript:', '')
            driver.execute_script(butt_submit_onclick_js)
            sleep(1)

            # There may be an extra code (captcha). It is generated only after
            # the login click, so after filling it in we submit once more.
            try:
                have_extra = driver.find_element_by_id(
                    'ImgExtraPwd').is_displayed()
            except WebDriverException:
                have_extra = False

            if have_extra:
                captcha_input = driver.find_element_by_id('ExtraPwd')
                captcha_url = driver.find_element_by_id(
                    'ImgExtraPwd').get_attribute('src')
                captcha_cookies = get_cookies_dict_from_webdriver(driver)
                captcha_body = get_content_by_requests(
                    captcha_url,
                    headers=self.headers,
                    cookie_jar=captcha_cookies)
                captcha_code = self.ask_image_captcha(captcha_body, username)
                captcha_input.send_keys(captcha_code)
                sleep(0.5)
                driver.execute_script(butt_submit_onclick_js)

            # We may already be logged in, in which case this element is gone.
            try:
                if driver.find_element_by_class_name(
                        'page-form-item-controls').is_displayed():
                    err_message = driver.find_element_by_xpath(
                        '//label[@class="control-text error-msg"]').text
                    yield from self.error_handle(username,
                                                 "招商银行---登录失败:%s" %
                                                 (err_message),
                                                 tell_msg=err_message)
                    return
            except WebDriverException:
                pass
            sleep(1)

            try:
                # Phone verification is not always required: after several SMS
                # code attempts it was, oddly, no longer requested the next
                # day. A missing send button therefore just skips this step.
                sleep(1)
                btn_send_code = driver.find_element_by_id('btnSendCode')
                self.element_click_three_times(btn_send_code)
                sms_code = self.ask_sms_captcha(username)
                sms_code_input = driver.find_element_by_name('txtSendCode')
                sms_code_input.send_keys(sms_code)
                sleep(1)
                submit = driver.find_element_by_id('btnVerifyCode')
                self.element_click_three_times(submit)
                sleep(2)
                try:
                    have_error = driver.find_element_by_class_name(
                        'control-explain').is_displayed()
                except WebDriverException:
                    have_error = False

                if have_error:
                    err_message = driver.find_element_by_xpath(
                        '//p[@class="control-explain"]').text
                    yield from self.error_handle(username,
                                                 "招商银行---登录失败:%s" %
                                                 (err_message),
                                                 tell_msg=err_message)
                    return
            except NoSuchElementException:
                pass

            # Site-map step. JavaScript is deliberately not executed here: the
            # original author noted that executing JS closes the page. The
            # session is instead handed over to a plain HTTP request.
            cookies = get_cookies_dict_from_webdriver(driver)
            client_no = driver.find_element_by_name('ClientNo').get_attribute(
                'value')
            data = {
                'ClientNo': client_no,
                'AuthName':
                '<AuthName>CBANK_DEBITCARD_ACCOUNTMANAGER</AuthName>'
            }
            meta['client_no'] = client_no
            meta["cookies"] = cookies
            yield FormRequest(url=self.apply_token_url,
                              callback=self.parse_token,
                              headers=self.headers,
                              formdata=data,
                              cookies=cookies,
                              meta=meta,
                              dont_filter=True,
                              errback=self.err_callback)
            # NOTE: an earlier implementation navigated the site map with
            # webdriver clicks and scraped balance and history in the browser.
            # That disabled code was removed; see version control if needed.
        except CaptchaTimeout:
            yield from self.error_handle(username,
                                         "招商银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(username,
                                          "招商银行---解析失败",
                                          tell_msg="解析失败")
        finally:
            driver.quit()
Example #21
0
    def parse(self, response):
        """Log in via an IE webdriver and page through the account details.

        The username is set with JavaScript, the password is typed through
        ``Ddxoft`` keyboard simulation, and an image captcha is solved when
        the captcha field is displayed. After login, the account-detail query
        is submitted starting at ``today - self.date_delta`` and every result
        page is parsed with ``__get_page_detail``. The webdriver is always
        quit before the generator finishes.

        Args:
            response: Response whose ``meta["item"]`` provides ``username``
                and ``password``. The page loaded is ``self.login_url``.

        Yields:
            The results of ``self.crawling_done(item)`` on success, otherwise
            those of ``self.error_handle`` (login failure, captcha timeout) or
            ``self.except_handle`` (any other exception).
        """
        meta = response.meta
        item = meta["item"]
        user_name = item['username']

        driver = self.load_page_by_webdriver(self.login_url, '//input[@name="qy_sut"]')
        try:
            # PID of the webdriver process. Looked up inside the try so the
            # finally clause still quits the browser if this lookup fails.
            pid = driver.iedriver.process.pid
            butt_submit = driver.find_element_by_name('qy_sut')

            with Ddxoft(pid) as dd_opt:
                # The username is passed as a script argument rather than
                # formatted into the JavaScript source, so quotes or
                # backslashes in it cannot break or alter the script.
                driver.execute_script('username_input=document.getElementById("alias");'
                                      'username_input.value=arguments[0];'
                                      'username_input.focus()', user_name)
                dd_opt.dd_tab()
                sleep(0.5)
                for char in item['password']:
                    dd_opt.dd_keyboard(char)
                    sleep(0.1)

            if driver.find_element_by_id('validatePwd').is_displayed():
                captcha_url = 'https://sbank.hxb.com.cn/easybanking/validateservlet'
                captcha_cookies = get_cookies_dict_from_webdriver(driver)
                captcha_body = get_content_by_requests(captcha_url, headers=self.headers,
                                                       cookie_jar=captcha_cookies)
                captcha_code = self.ask_image_captcha(captcha_body, user_name)
                driver.execute_script('document.getElementById("verifyCode").value=arguments[0]',
                                      captcha_code)
                sleep(0.5)
                # NOTE(review): a src containing 'validateNoError' is treated
                # as a captcha failure here; confirm this is not inverted.
                if 'validateNoError' in driver.find_element_by_id('valdtErr').get_attribute('src'):
                    # The password is deliberately left out of the message so
                    # credentials are not written to logs.
                    yield from self.error_handle(user_name,
                                                 "华夏银行---登录失败:(username:%s) %s"
                                                 % (user_name, '验证码输入错误'),
                                                 tell_msg='验证码错误')
                    return
            self.element_click_three_times(butt_submit)
            sleep(2)
            try:
                # The 'mess' element only exists when the login was rejected.
                error_message = driver.find_element_by_id('mess')
                message = error_message.text
                yield from self.error_handle(user_name,
                                             "华夏银行---登录失败:(username:%s) %s"
                                             % (user_name, message),
                                             tell_msg=message)
                return
            except NoSuchElementException:
                pass

            # Open the main navigation entry, then the account-detail query.
            self.element_click_three_times(driver.find_element_by_class_name('main_nav_1'))
            sleep(0.5)
            self.element_click_three_times(driver.find_element_by_xpath('//a[contains(text(),"账户明细查询")]'))
            from_day = (date.today() - self.date_delta).strftime("%Y%m%d")
            # from_day is generated locally in %Y%m%d form, so formatting its
            # parts into the script is safe.
            driver.execute_script('document.getElementsByName("queryStrDateYear")[0].value="%s";'
                                  'document.getElementsByName("queryStrDateMonth")[0].value="%s";'
                                  'document.getElementsByName("queryStrDateDay")[0].value="%s";'
                                  % (from_day[:4], from_day[4:6], from_day[6:8]))

            self.element_click_three_times(driver.find_element_by_id('form_submit'))
            sleep(0.5)
            page_source = driver.page_source
            result = self.__get_page_detail(page_source)
            # Follow the "next page" link, capped at 100 pages.
            for _ in range(100):
                try:
                    next_page_btn = driver.find_element_by_xpath('//a[contains(text(),"下一页")]')
                    self.element_click_three_times(next_page_btn)
                    sleep(1)
                    page_source = driver.page_source
                    result.extend(self.__get_page_detail(page_source))
                except Exception:
                    break
                try:
                    # Stop once the link on the freshly loaded page is
                    # disabled or missing.
                    next_page_btn = driver.find_element_by_xpath('//a[contains(text(),"下一页")]')
                    if next_page_btn.get_attribute('disabled') == 'disabled':
                        break
                except NoSuchElementException:
                    break

            item['trade_records'] = result

            if 'balance' not in item and result:
                item['balance'] = result[0].get('trade_balance', 0)

            yield from self.crawling_done(item)
        except CaptchaTimeout:
            yield from self.error_handle(user_name, "华夏银行---等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。。")
        except Exception:
            yield from self.except_handle(user_name, "华夏银行---解析失败", tell_msg="解析失败")
        finally:
            driver.quit()
Example #22
0
 def parse_login(self, response):
     """Process the login response: follow the redirect or retry with a captcha.

     A 302 response is followed to its ``Location`` header with
     ``parse_getJsession`` as the callback. Any other status is inspected:
     a status message on the page is reported as a login error; a captcha
     input (or an error-styled input block) triggers a new login attempt
     with a freshly solved captcha, as long as
     ``meta['captcha_retry_time']`` has not dropped below zero; anything
     else is reported as a wrong account or password.

     Args:
         response: Login response whose ``meta`` carries ``item``
             (``username``/``password``), ``headers`` and
             ``captcha_retry_time``.

     Yields:
         A follow-up ``Request``/``FormRequest``, or the results of
         ``self.error_handle`` / ``self.except_handle``.
     """
     meta = response.meta
     item = meta["item"]

     # Successful login: the server redirects to the session endpoint.
     if response.status == 302:
         try:
             redirect_target = response.headers.get('Location')
             if not redirect_target:
                 yield from self.error_handle(item["username"],
                                              "%s 账号或密码错误" %
                                              item["username"],
                                              tell_msg='账号或密码错误')
             else:
                 redirect_target = redirect_target.decode()
                 self.logger.info("请求获取sessionid接口->%s" % redirect_target)
                 request_headers = meta['headers']
                 request_headers['Referer'] = response.url
                 yield Request(headers=request_headers,
                               url=redirect_target,
                               callback=self.parse_getJsession,
                               meta=meta,
                               errback=self.err_callback,
                               dont_filter=True)
         except Exception:
             yield from self.except_handle(item["username"],
                                           "学信网---登录数据解析异常")
         return

     # The page carries an explicit status message: report it to the user.
     status_text = response.xpath(
         '//div[@id="status"]/text()').extract_first()
     if status_text:
         yield from self.error_handle(item["username"],
                                      "%s 账号或密码错误" % item["username"],
                                      tell_msg=status_text)
         return

     captcha_requested = (
         response.xpath('//input[@id="captcha"]').extract_first()
         or response.xpath('//div[@class="ct_input errors"]').extract_first())
     if not captcha_requested:
         yield from self.error_handle(item["username"],
                                      "%s 账号或密码错误" % item["username"],
                                      tell_msg='账号或密码错误')
         return

     # Captcha requested: spend one retry and give up once none are left.
     meta['captcha_retry_time'] -= 1
     if meta['captcha_retry_time'] < 0:
         yield from self.error_handle(item["username"],
                                      "%s 图片验证码请求五次,退出" %
                                      item["username"],
                                      tell_msg='验证码已刷新五次,请重试')
         return

     lt = response.xpath('//input[@name="lt"]/@value').extract_first("")
     cookiejar = get_cookiejar_from_response(response)
     captcha_url = ("https://account.chsi.com.cn/passport/captcha.image?id=" +
                    str(random()))
     headers = meta['headers']
     captcha_body = get_content_by_requests(captcha_url,
                                            headers,
                                            cookie_jar=cookiejar)
     captcha_code = self.ask_image_captcha(captcha_body,
                                           item['username'],
                                           file_type=".jpeg")
     req_data = self.get_req_data(self.user_login,
                                  user_name=item["username"],
                                  password=item["password"],
                                  lt=lt,
                                  captcha=captcha_code)
     # Best effort: forward the server's Set-Cookie value when there is one.
     try:
         headers['Cookie'] = response.headers.get('Set-Cookie').decode()
     except Exception:
         pass
     self.logger.debug(req_data)
     self.logger.debug(headers)
     yield FormRequest(headers=headers,
                       url=self._start_url_,
                       callback=self.parse_login,
                       formdata=req_data,
                       meta=meta,
                       errback=self.err_callback,
                       dont_filter=True)