def parse_get_xj(self, response):
    """
    Handle the student-status (学籍) page, then request the education page.

    Every ``div.clearfix`` block on the page carries an ``img.xjxx-img``
    picture. Each picture is downloaded with the archive ``JSESSIONID``
    cookie and passed to ``self.pic_orc``; the result is stored in
    ``meta['xj_dict']`` under the key computed by ``self._get_xjxl_key``.
    Finally a request for ``self.gdjy_xl_url`` is yielded with
    ``self.parse_get_xl`` as callback.

    Any exception raised while doing so is routed to ``self.except_handle``.
    """
    meta = response.meta
    item = meta["item"]
    archive_jsessionid = meta["archive_jsessionid"]
    try:
        headers = meta['headers']
        records = {}
        for block in response.xpath('//div[@class="clearfix"]'):
            img_src = block.xpath(
                './/img[@class="xjxx-img"]/@src').extract_first("")
            img_bytes = get_content_by_requests(
                img_src,
                headers=headers,
                cookie_jar={'JSESSIONID': archive_jsessionid})
            record = self.pic_orc(img_bytes)
            records[self._get_xjxl_key(record)] = record
        meta['xj_dict'] = records

        yield Request(url=self.gdjy_xl_url,
                      headers=headers,
                      cookies={'JSESSIONID': archive_jsessionid},
                      callback=self.parse_get_xl,
                      meta=meta,
                      dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield from self.except_handle(item["username"],
                                      "学信网---提取学籍信息数据解析异常")
def get_captcha_code(self, response):
    """
    Fetch the captcha image, recognise it and return the recognised code.

    The method loops until ``self._get_captcha_code`` returns a code of
    exactly four characters. Between attempts it sleeps for the
    ``DOWNLOAD_DELAY`` setting (0.3 s when the setting is absent).

    :param response: response whose headers are reused for the download
    :return: the 4-character captcha code
    """
    headers = get_headers_from_response(response)
    sleep_time = self.settings.get("DOWNLOAD_DELAY", 0.3)
    captcha_id = self.captcha_id
    _get_captcha_code = self._get_captcha_code

    while True:
        form_data = {
            "captchaId": captcha_id,
            "random": str(rand_0_1()),
        }
        try:
            captcha_body = get_content_by_requests(
                "http://zhixing.court.gov.cn/search/captcha.do?" +
                urlencode(form_data),
                headers=headers)
        except Exception:
            # Best-effort download: wait and try again.
            sleep(sleep_time)
            continue

        if captcha_body.startswith(b"<"):
            # A markup page came back instead of an image.
            self.logger.error("被执行人---验证码:请开启JavaScript并刷新该页")
            # Use the same locally resolved delay as every other retry path
            # (the original read ``self.sleep_time`` here).
            sleep(sleep_time)
            continue

        captcha_code = _get_captcha_code(captcha_body)
        if len(captcha_code) == 4:
            return captcha_code
        sleep(sleep_time)
def http_request(self, url, method="GET", data=None, headers=None,
                 cookies=None, to_json=False, get_str=True, charset="utf-8",
                 get_cookies=False):
    """
    Thin wrapper around the project's ``requests`` helpers.

    :param url: target URL
    :param method: ``"GET"`` or ``"POST"``; anything else is logged and
        ``None`` is returned
    :param data: request body for POST
    :param headers: request headers; defaults to ``self.headers``
    :param cookies: cookies as a dict, or a list of dicts with ``name`` and
        ``value`` keys; defaults to ``self.cookies``
    :param to_json: parse the decoded text with ``json_loads``
    :param get_str: decode the body with ``charset``; when false the raw
        content is returned and ``to_json`` is ignored
    :param charset: encoding used to decode the body
    :param get_cookies: also return the response cookies
    :return: the result, or ``{"result": ..., "cookies": {...}}`` when
        ``get_cookies`` is true; ``None`` on any error
    """
    try:
        cookies_dic = {}
        if headers is None:
            headers = self.headers or {}
        if cookies is None:
            cookies = self.cookies or {}
        if isinstance(cookies, list):
            cookies = {cookie["name"]: cookie["value"] for cookie in cookies}

        if method == "GET":
            if get_cookies:
                resp = get_response_by_requests(url, headers=headers,
                                                cookie_jar=cookies)
                cookies_dic = resp.cookies.get_dict()
                content = resp.content
            else:
                content = get_content_by_requests(url, headers=headers,
                                                  cookie_jar=cookies)
        elif method == "POST":
            if get_cookies:
                # Forward the body here as well; the original dropped
                # ``data`` whenever cookies were requested.
                resp = get_response_by_requests_post(url, data=data,
                                                     headers=headers,
                                                     cookie_jar=cookies)
                cookies_dic = resp.cookies.get_dict()
                content = resp.content
            else:
                content = get_content_by_requests_post(url, data=data,
                                                       headers=headers,
                                                       cookie_jar=cookies)
        else:
            self.logger.error("暂不支持该请求方法")
            return

        if get_str:
            result = content.decode(charset)
            if to_json:
                result = json_loads(result)
        else:
            result = content

        if get_cookies:
            return {"result": result, "cookies": cookies_dic}
        return result
    except Exception:
        self.logger.exception("请求出错: url:%s" % url)
        return
def send_pic_get_ponint6_to_ssdb(self, img_url, img_desc, username):
    """
    Download the picture at ``img_url``, have it answered through
    ``self.ask_image_captcha`` and return the first six comma-separated
    fields of the answer as a tuple.

    Raises ``IndexError`` when the answer has fewer than six fields.
    """
    image_bytes = get_content_by_requests(img_url, self.headers)
    answer = self.ask_image_captcha(image_bytes,
                                    username,
                                    file_type=".jpeg",
                                    image_describe=img_desc)
    fields = answer.split(',')
    # Index explicitly (no slicing) so a short answer still raises IndexError.
    return tuple([fields[index] for index in range(6)])
def _get_callDetail_image_captcha(self, response, username):
    """
    Download a call-detail image captcha and return its raw content.

    The headers taken from ``response`` are stored through
    ``self.set_image_captcha_headers_to_ssdb`` before the download.
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    # The ``t`` query value is whatever ``rand_0_1()`` returns.
    captcha_url = "".join(
        ["http://shop.10086.cn/i/authImg?t=", str(rand_0_1())])
    self.set_image_captcha_headers_to_ssdb(headers, username)
    image_bytes = get_content_by_requests(captcha_url,
                                          headers=headers,
                                          cookie_jar=cookiejar)
    return image_bytes
def user_login(self, response):
    """
    Handle the JSON login response and dispatch on its ``retcode``.

    * ``'4049'``: a captcha is required. The image at
      ``meta['captcha_url']`` is downloaded and answered through
      ``self.ask_image_captcha``, the answer is stored in
      ``meta["captecha_code"]`` and the pre-login step is requested again.
    * ``'101'``, ``'2070'``, ``'2079'``: reported as wrong username or
      password.
    * anything else: continue with the cross-domain URLs from the response.

    Any exception is routed to ``self.except_handle`` together with a
    logout request.
    """
    meta = response.meta
    item = meta['item']
    username = item['username']
    try:
        login_result = json_loads(response.text)
        retcode = login_result['retcode']

        if retcode == '4049':
            # Captcha required.
            headers = get_headers_from_response(response)
            captcha_url = meta['captcha_url']
            # Hand the headers over to django.
            self.set_image_captcha_headers_to_ssdb(headers, username)
            self.set_email_img_url_to_ssdb(captcha_url, username)
            image_bytes = get_content_by_requests(captcha_url,
                                                  headers=headers)
            answer = self.ask_image_captcha(image_bytes,
                                            username,
                                            file_type=".png")
            # Login from an unusual location needs captcha verification.
            meta["captecha_code"] = {'door': answer, 'pcid': meta['pcid']}

            su = self._enb64(self._url_encode(username)).decode()
            step1_url = ('http://login.sina.com.cn/sso/prelogin.php?entry=sso'
                         '&callback=sinaSSOController.preloginCallBack'
                         '&su=%s&rsakt=mod&client=ssologin.js(v1.4.19)' % su)
            yield Request(url=step1_url,
                          callback=self.step1,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        elif retcode in ('101', '2070', '2079'):
            err_message = '登录名或密码错误!'
            log_message = "sina---登录失败:(username:%s, password:%s) %s" % (
                username, item['password'], err_message)
            yield from self.error_handle(username,
                                         msg=log_message,
                                         tell_msg=err_message)
        else:
            cross_urls = login_result['crossDomainUrlList']
            meta["cross"] = cross_urls[0]
            yield Request(url=cross_urls[1],
                          callback=self.cross_domain_one,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
    except Exception:
        yield from self.except_handle(
            username,
            msg="用户登录解析异常",
            tell_msg="邮箱登录失败,请刷新重试",
            logout_request=self.get_logout_request(meta))
def download_pic(self, url):
    """
    Download the captcha picture.

    :param url: picture URL
    :return: the downloaded content, or ``None`` when the download raised
        (the exception is logged)
    """
    picture = None
    try:
        picture = get_content_by_requests(url, headers=self.headers)
    except Exception:
        self.logger.exception("下载验证码图片失败")
    return picture
def _get_captcha_code_by_phantomJS(self, driver, username):
    """
    Download the image captcha with the webdriver's cookies and return the
    answer obtained through ``self.ask_image_captcha``.
    """
    # ``rd`` is filled with the value returned by ``random()``.
    captcha_url = (
        "http://m.pingan.com/t/ImageGif.do?v=2&rd=%s&imageNum=359" % random())
    driver_cookies = get_cookies_dict_from_webdriver(driver)
    image_bytes = get_content_by_requests(captcha_url,
                                          self.headers,
                                          cookie_jar=driver_cookies)
    return self.ask_image_captcha(image_bytes, username, file_type=".jpeg")
def parse(self, response):
    """
    Spider entry point: prepare ``meta`` and submit the login form.

    A copy of ``self.headers`` is stored in ``meta['headers']``, the captcha
    retry budget is set to 5 and ``item['xueli']`` is reset to an empty
    list. A 302 response is delegated to ``self.parse_login``. Otherwise the
    ``lt`` token is read from the page, a captcha is answered when the page
    shows a captcha input or a captcha error box, and the login form is
    posted to ``self._start_url_``.

    Any exception on the non-302 path is routed to ``self.except_handle``.
    """
    headers = self.headers.copy()
    meta = response.meta
    meta['headers'] = headers
    meta['captcha_retry_time'] = 5
    item = meta["item"]
    item['xueli'] = []

    if response.status == 302:
        yield from self.parse_login(response)
        return

    try:
        self.logger.info("请求登录接口->%s" % self.user_login)
        lt = response.xpath('//input[@name="lt"]/@value').extract_first("")

        captcha_code = None
        captcha_required = (
            response.xpath('//input[@id="captcha"]').extract_first()
            or response.xpath(
                '//div[@class="ct_input errors"]').extract_first())
        if captcha_required:
            meta['captcha_retry_time'] -= 1
            cookiejar = get_cookiejar_from_response(response)
            captcha_url = (
                "https://account.chsi.com.cn/passport/captcha.image?id=" +
                str(random()))
            proxy = meta['proxy']
            image_bytes = get_content_by_requests(
                captcha_url,
                headers,
                cookie_jar=cookiejar,
                proxies={"https": proxy, "http": proxy})
            captcha_code = self.ask_image_captcha(image_bytes,
                                                  item['username'],
                                                  file_type=".jpeg")

        req_data = self.get_req_data(self.user_login,
                                     user_name=item["username"],
                                     password=item["password"],
                                     lt=lt,
                                     captcha=captcha_code)
        self.logger.debug(req_data)

        headers['Cookie'] = response.headers.get('Set-Cookie').decode()
        headers['Referer'] = self._start_url_
        yield FormRequest(url=self._start_url_,
                          headers=headers,
                          formdata=req_data,
                          callback=self.parse_login,
                          meta=meta,
                          errback=self.err_callback,
                          dont_filter=True)
    except Exception:
        yield from self.except_handle(meta["item"]["username"],
                                      "学信网---爬虫解析入口异常")
def _verify_callDetail_captcha(self, response, username, captcha_code):
    """
    Ask the pre-check endpoint whether ``captcha_code`` is correct.

    :return: ``True`` when the reply contains ``"retCode":"000000"``
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    precheck_url = "".join([
        "http://shop.10086.cn/i/v1/res/precheck/",
        username,
        "?captchaVal=",
        captcha_code,
        "&_=",
        get_js_time(),
    ])
    reply = get_content_by_requests(precheck_url,
                                    headers=headers,
                                    cookie_jar=cookiejar)
    return b'"retCode":"000000"' in reply
def verify_captcha(self, response, captcha_code):
    """
    China Mobile exposes a URL that checks a captcha without submitting the
    login form; use it to verify ``captcha_code``.

    :return: ``True`` when the reply contains ``"resultCode":"0"``
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    # unicode-escape the code, then strip the ``\u`` prefixes.
    escaped_code = captcha_code.encode('unicode-escape', "ignore")
    escaped_code = escaped_code.replace(b"\\u", b"")
    check_url = b"https://login.10086.cn/verifyCaptcha?inputCode=" + escaped_code
    reply = get_content_by_requests(check_url,
                                    headers=headers,
                                    cookie_jar=cookiejar)
    return b'"resultCode":"0"' in reply
def parse_get_xl(self, response):
    """
    Handle the education (学历) page and finish the crawl.

    For every ``div.clearfix`` block the diploma photo is downloaded (unless
    it is missing or a ``no-photo`` placeholder) and the ``img.xjxx-img``
    picture is passed to ``self.pic_orc``. A record whose key (from
    ``self._get_xjxl_key``) also exists in ``meta['xj_dict']`` is merged
    with that student-status record; student-status records left unmatched
    are appended at the end.

    Any exception is routed to ``self.except_handle``.
    """
    meta = response.meta
    item = meta["item"]
    xj_dict = meta['xj_dict']
    try:
        xueli = item['xueli']
        archive_jsessionid = meta["archive_jsessionid"]
        for block in response.xpath('//div[@class="clearfix"]'):
            # Link of the diploma photo inside the education record.
            photo_src = block.xpath(
                './/div[@class="pic"]/img/@src').extract_first()
            photo_bytes = b''
            if photo_src and 'no-photo' not in photo_src:
                photo_bytes = get_content_by_requests(
                    'https://my.chsi.com.cn' + photo_src,
                    headers=meta['headers'],
                    cookie_jar={'JSESSIONID': archive_jsessionid})

            info_src = block.xpath(
                './/img[@class="xjxx-img"]/@src').extract_first("")
            info_bytes = get_content_by_requests(
                info_src,
                headers=meta['headers'],
                cookie_jar={'JSESSIONID': archive_jsessionid})
            xl_record = self.pic_orc(info_bytes)
            xl_record['photo'] = photo_bytes

            key = self._get_xjxl_key(xl_record)
            if key not in xj_dict:
                xueli.append(xl_record)
            else:
                xueli.append(self.merge_dict(xj_dict.pop(key), xl_record))

        # Student-status records without a matching education record.
        xueli.extend(xj_dict.values())
        yield from self.crawling_done(item)
    except Exception:
        yield from self.except_handle(item["username"],
                                      "学信网---提取学历信息数据解析异常")
def parse(self, response):
    """
    Log in to Ping An Bank through the webdriver and collect the account
    data.

    Steps: load the login page, answer the image captcha when the
    ``verifyCode`` input is displayed, fill in the credentials, then wait up
    to 6 seconds for the ``safe_logout`` element. On timeout the page source
    decides which error is reported; on success ``self._login_success``
    supplies ``item["balance"]`` and ``item["trade_records"]``.

    User-supplied values (username, password, captcha answer) are handed to
    ``execute_script`` as script arguments rather than spliced into the
    JavaScript source, so quotes or backslashes in them cannot break the
    script.

    The webdriver is always quit at the end.
    """
    item = response.meta["item"]
    username = item["username"]
    password = item["password"]
    url = self.login_url + get_js_time() + "?returnURL=account%2Findex%2FtransferList"
    driver = self.load_page_by_webdriver(url, "//div[@id='pwdObject1-btn-pan']")
    try:
        try:
            if driver.find_element_by_id("verifyCode").is_displayed():
                url = "https://bank.pingan.com.cn/ibp/portal/pc/getVcode2.do?" + get_js_time()
                cookiejar = get_cookies_dict_from_webdriver(driver)
                capcha_body = get_content_by_requests(url, headers=self.headers, cookie_jar=cookiejar)
                captcha_code = self.ask_image_captcha(capcha_body, username)
                driver.execute_script(
                    'document.getElementById("verifyCode").value=arguments[0];',
                    captcha_code)
        except CaptchaTimeout:
            # Handled by the outer handler below.
            raise
        except Exception:
            # Captcha handling is best-effort; log and still try to log in.
            self.logger.exception("平安银行---图片验证码")

        # Fill in username and password, then submit.
        driver.execute_script(
            'document.getElementById("pwdObject1-btn-pan").click();'
            'document.getElementById("userName").value=arguments[0];'
            'document.getElementById("pwdObject1-input").value=arguments[1];'
            'document.getElementById("login_btn").click();',
            username, password)

        try:
            self.wait_xpath(driver, "//li[@id='safe_logout']", raise_timeout=True, timeout=6)
        except TimeoutException:
            page_source = driver.page_source
            if "密码错" in page_source:
                yield from self.error_handle(
                    username, "平安银行---登录",
                    driver.find_element_by_xpath("//span[@id='errorLoginMsg']").text)
            elif "证码错" in page_source:
                yield from self.error_handle(
                    username, "平安银行---登录",
                    driver.find_element_by_xpath("//span[@id='verifyError']").text)
            else:
                yield from self.error_handle(
                    username, "平安银行---登录异常(%s)" % page_source, "登录失败")
        else:
            item["balance"], item["trade_records"] = self._login_success(driver)
            yield from self.crawling_done(item)
    except CaptchaTimeout:
        yield from self.error_handle(username, "平安银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(username, "平安银行---爬取", "爬取异常")
    finally:
        driver.quit()
def check_need_sms_captcha(self, response, username):
    """
    Ask ``needVerifyCode.htm`` whether ``username`` needs a verification
    code.

    :return: ``True`` when the reply contains ``"needVerifyCode":"1"``
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    # A "pwdType": "02" field was previously sent as well; it is disabled.
    query = urlencode({
        "accountType": "01",
        "account": username,
        "timestamp": get_js_time(),
    })
    reply = get_content_by_requests(
        "https://login.10086.cn/needVerifyCode.htm?" + query,
        headers=headers,
        cookie_jar=cookiejar)
    return b'"needVerifyCode":"1"' in reply
def verify_captcha(self, response, captcha_code, cookies_dict):
    """
    China Unicom exposes a URL that checks a captcha without submitting the
    login form; use it to verify ``captcha_code``.

    :param response: response whose headers are reused for the check
    :param captcha_code: the code to verify
    :param cookies_dict: cookies sent with the check request
    :return: ``True`` when the reply contains ``"resultCode":"true"``
    """
    headers = get_headers_from_response(response)
    the_time = get_js_time()
    # Integer bounds: randint() rejects float arguments on Python 3.12+, and
    # ``1E17 - 1`` is not exactly representable as a float, so the original
    # upper bound was off as well. This always yields 17 digits.
    callback_suffix = randint(10 ** 16, 10 ** 17 - 1)
    form_data = {
        'verifyCode': captcha_code,
        'verifyType': "1",
        '_': int(the_time) + 1,
        'callback': "jQuery1720" + str(callback_suffix) + "_" + the_time
    }
    url = "https://uac.10010.com/portal/Service/CtaIdyChk?" + urlencode(
        form_data)
    info = get_content_by_requests(url,
                                   headers=headers,
                                   cookie_jar=cookies_dict)
    return b'"resultCode":"true"' in info
def parse(self, response):
    """
    Log in to CITIC Bank through the webdriver and request the balance page.

    Steps, in order: type the credentials (the password goes through the
    ``Ddxoft`` keyboard helper), answer the image captcha when it is
    displayed, submit the form, skip the "change password" prompt, pass the
    SMS verification when it appears, extract ``EMP_SID`` and finally yield
    a ``FormRequest`` for ``self.account_balance_url`` carrying the
    webdriver's cookies.

    Failures are reported through ``self.error_handle`` /
    ``self.except_handle``; the webdriver is always quit at the end.

    NOTE(review): the indentation of this method was lost in the reviewed
    listing. The extent of the ``with`` block and of the key-typing loop
    below is reconstructed - confirm against the original file.
    """
    meta = response.meta
    item = meta["item"]
    username = item['username']
    driver = self.load_page_by_webdriver(response.url)
    pid = driver.iedriver.process.pid
    try:
        sleep(3)
        with Ddxoft(pid) as dd_opt:
            # Put the username in with JavaScript and focus the field, then
            # TAB to the password field and type the password key by key.
            driver.execute_script(
                'document.getElementsByName("logonNoCert")[0].value="{0}";'
                'document.getElementsByName("logonNoCert")[0].focus()'.
                format(username))
            dd_opt.dd_tab()
            sleep(0.1)
            for i in item['password']:
                dd_opt.dd_keyboard(i)
            sleep(0.1)

        if driver.find_element_by_id('verifyId').is_displayed():
            # Image captcha: download it with the browser's cookies.
            captcha_input = driver.find_element_by_name('verifyCode')
            captcha_url = driver.find_element_by_id(
                'pinImg').get_attribute('src')
            capcha_cookies = get_cookies_dict_from_webdriver(driver)
            capcha_body = get_content_by_requests(
                captcha_url, headers=self.headers, cookie_jar=capcha_cookies)
            captcha_code = self.ask_image_captcha(capcha_body, username)
            captcha_input.send_keys(captcha_code)
            sleep(0.5)
            # Accept the captcha right/wrong alert popup when there is one.
            try:
                driver.switch_to.alert.accept()
            except Exception:
                pass
            if 'stop_button' in driver.find_element_by_id(
                    'verifyImg').get_attribute('src'):
                yield from self.error_handle(
                    username,
                    "中信银行---登录失败:(username:%s, password:%s) %s" %
                    (username, item["password"], '验证码输入错误'),
                    tell_msg='验证码错误')
                return

        # Submit by running the button's onclick script
        # (butt_submit.click() is not used).
        butt_submit = driver.find_element_by_id('logonButton')
        butt_submit_onclick_js = butt_submit.get_attribute('onclick')
        driver.execute_script(butt_submit_onclick_js)
        sleep(2)
        try:
            errorReason = driver.find_element_by_class_name('errorReason')
            message = errorReason.text
            yield from self.error_handle(
                username,
                "中信银行---登录失败:(username:%s, password:%s) %s" %
                (username, item["password"], message),
                tell_msg=message)
            return
        except NoSuchElementException:
            # No error element: the login went through.
            pass

        # NOTE: when element 'firstLogonMdyPwdID' is displayed the password
        # has to be reset. This showed up only once, so it is just recorded
        # here and not handled.
        ''' 跳过修改密码'''
        # (the string above reads "skip changing the password")
        if driver.find_elements_by_xpath('//a[contains(text(),"跳过")]'):
            jump_js = (driver.find_element_by_xpath(
                '//a[contains(@onclick,"jumpTip")]'
            ).get_attribute('onclick')
                       or driver.find_element_by_xpath(
                           '//a[contains(text(),"跳过")]').get_attribute('onclick'))
            driver.execute_script(jump_js)
            if driver.find_element_by_id('jumpTipDiv').is_displayed():
                jump_ok_js = driver.find_element_by_id(
                    'jump').get_attribute('onclick')
                driver.execute_script(jump_ok_js)

        ''' 短信验证 (突然又不出现了暂未测完全)'''
        # (the string above reads "SMS verification - it stopped showing up,
        # so this is not fully tested")
        if driver.find_elements_by_name('mdpBtn'):
            get_sms_code_js = driver.find_element_by_name(
                'mdpBtn').get_attribute('onclick').replace(
                    'javascript:', '')
            btn_next_js = driver.find_element_by_id(
                'nextStep').get_attribute('onclick').replace(
                    'javascript:', '')
            # Click the check box until it is selected, at most 10 times.
            max_loop_time = 10
            while not driver.find_element_by_id(
                    'checkId').is_selected() and max_loop_time > 0:
                driver.find_element_by_id('checkId').click()
                max_loop_time -= 1
            driver.execute_script(get_sms_code_js)
            sms_code = self.ask_sms_captcha(username)
            driver.execute_script(
                'document.getElementsByName("mobilDynPwdStr1")[0].value="{0}";'
                .format(sms_code))
            sleep(1)
            driver.execute_script(btn_next_js)
            if driver.find_elements_by_xpath('//dl/dd/b'):
                err_message = driver.find_element_by_xpath(
                    '//dl/dd/b').text
                yield from self.error_handle(username,
                                             "中信银行---短信验证码出错:%s " %
                                             err_message,
                                             tell_msg=err_message)
                return

        self.wait_xpath(driver, '//input[@id="searchItemId"]')
        # First look for EMP_SID in the logout form's action attribute.
        EMP_SID = self.EMP_SID_pattern.search(
            driver.find_element_by_id('formLogout').get_attribute(
                'action'))
        if EMP_SID:
            EMP_SID = EMP_SID.group(1)
        else:
            # Fallback: search for "账户查询" from the search box and read
            # EMP_SID from the page source inside the 'mainframe' frame.
            driver.execute_script(
                'document.getElementById("searchItemId").value="账户查询";')
            driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
            driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
            driver.find_element_by_id('searchItemId').send_keys(Keys.ENTER)
            sleep(1)
            mainframe = driver.find_element_by_id('mainframe')
            driver.switch_to.frame(mainframe)
            try:
                EMP_SID = self.EMP_SID_pattern.search(
                    driver.page_source).group(1)
            except AttributeError:
                # search() returned None: the pattern did not match.
                yield from self.error_handle(username,
                                             "中信银行---解析失败:EMP_SID 获取失败",
                                             tell_msg='解析出错!')
                return

        meta["EMP_SID"] = EMP_SID
        cookies = get_cookies_dict_from_webdriver(driver)
        yield FormRequest(url=self.account_balance_url.format(EMP_SID),
                          callback=self.parse_balance,
                          headers=self.headers,
                          cookies=cookies,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
    except CaptchaTimeout:
        yield from self.error_handle(username,
                                     "中信银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(username,
                                      "中信银行---解析失败",
                                      tell_msg="解析失败")
    finally:
        driver.quit()
def _get_pic_by_requests(self, driver, url):
    """Download ``url`` with ``self.headers`` and the webdriver's cookies."""
    driver_cookies = get_cookies_dict_from_webdriver(driver)
    picture = get_content_by_requests(url,
                                      self.headers,
                                      cookie_jar=driver_cookies)
    return picture
def parse(self, response):
    """
    Log in to CMBC (Minsheng Bank) with an IE webdriver and collect the
    balance and the trade records.

    Steps, in order: open the login page, type the credentials (the password
    goes through the ``Ddxoft`` keyboard helper), answer the image captcha
    when it is displayed, submit, read the balance, open the transaction
    page and query it four times, moving the start date back by one
    ``self.get_theday_3_month_ago`` step each time.

    Failures are reported through ``self.error_handle`` /
    ``self.except_handle``; the webdriver is always quit at the end.

    NOTE(review): the indentation of this method was lost in the reviewed
    listing. The extent of the ``with`` block, of the key-typing loop and of
    the ``if 'display: none'`` body is reconstructed - confirm against the
    original file.
    """
    item = response.meta["item"]
    username = item["username"]
    driver = self.getdriver(
        executable_path=self.settings["IE_EXECUTABLE_233_PATH"],
        browser_type="IE")
    pid = driver.iedriver.process.pid
    try:
        wait = ui.WebDriverWait(driver, 20)
        driver.get(response.url)
        wait.until(lambda dr: dr.find_element_by_id("loginButton"))
        driver.maximize_window()
        with Ddxoft(pid) as visual_keyboard:
            # Put the username in with JavaScript and focus the field, then
            # TAB to the password field and type the password key by key.
            driver.execute_script(
                "username_input=document.getElementById('writeUserId');"
                "username_input.value='%s';"
                "username_input.focus();" % username)
            visual_keyboard.dd_tab()
            sleep(0.1)
            for key in item["password"]:
                visual_keyboard.dd_keyboard(key)
            sleep(0.1)

        # Check whether a captcha has to be entered.
        captcha_input = driver.find_element_by_id("_vTokenName")
        if captcha_input and captcha_input.is_displayed():
            # Captcha: download it with the browser's cookies.
            captcha_url = "https://nper.cmbc.com.cn/pweb/GenTokenImg.do?random=" + str(
                random())
            capcha_cookies = get_cookies_dict_from_webdriver(driver)
            capcha_body = get_content_by_requests(
                captcha_url, headers=self.headers, cookie_jar=capcha_cookies)
            captcha_code = self.ask_image_captcha(capcha_body, username)
            driver.execute_script(
                "document.getElementById('_vTokenName').value='" +
                captcha_code + "';")

        driver.execute_script(
            "document.getElementById('loginButton').click();")
        sleep(2)
        curr_url = driver.current_url
        if 'main.html' not in curr_url:
            # Login check: the URL did not change to main.html.
            error = driver.find_element_by_id('jsonError').text
            yield from self.error_handle(
                username,
                msg="民生银行---登录失败:(username:%s, password:%s) %s" %
                (username, item['password'], '账号密码错误'),
                tell_msg=error)
            return

        wait.until(lambda dr: dr.find_elements_by_xpath(
            "//form[@id='welcomeMainContent']"
            "//a[text()='查询明细']"))
        item['balance'] = driver.find_element_by_xpath(
            '//div[@class="sy_m1_x v-scope"]'
            '/div/span[@class="v-binding"]').text

        # Go to the bank-statement page.
        driver.execute_script(
            'document.getElementById("welcomeMainContent")'
            '.getElementsByClassName("yuanbj")[2].click()')
        wait.until(
            lambda dr: dr.find_element_by_id("QuickTitle").is_displayed())

        end_date = date.today()
        trade_records = item["trade_records"]
        for i in range(4):
            # A single query may only span three months.
            # NOTE(review): only the start date is set in the page below;
            # ``end_date`` is used solely to compute the next start date -
            # confirm this is intended.
            start_date = self.get_theday_3_month_ago(end_date)
            try:
                wait.until(lambda dr: dr.find_elements_by_xpath(
                    "//input[@v-model='BeginDate']"))
                # Pick the start date in the date picker.
                driver.execute_script(
                    '$("input[v-model=BeginDate]").focus();')
                sleep(0.1)
                driver.execute_script(
                    '$("select[data-handler=selectYear]").val("%s");'
                    '$("select[data-handler=selectYear]").change();' %
                    start_date.year)
                sleep(0.1)
                # The month select is given ``month - 1``.
                driver.execute_script(
                    '$("select[data-handler=selectMonth]").val("%s");'
                    '$("select[data-handler=selectMonth]").change();' %
                    (start_date.month - 1))
                sleep(0.1)
                day = start_date.day
                # Click the calendar cell whose text equals the day and
                # whose class lacks "ui-priority-secondary".
                driver.execute_script(''' $("table.ui-datepicker-calendar").find("a:contains(%s)").each(function(){ if ($(this).text() == "%s" && $(this).attr("class").indexOf("ui-priority-secondary") == -1) { this.click(); } });''' % (day, day))
                sleep(1)
                if 'display: none' in driver.find_element_by_id(
                        "jsonErrorShow").get_attribute("style"):
                    wait.until(lambda dr: dr.find_element_by_id(
                        "DataTable").is_displayed())
                    # A disabled alternative used to sit here: POST the
                    # query as JSON to /pweb/ActTrsQry.do with requests and
                    # the browser's cookies instead of reading the page.
                    trade_records.extend(
                        self.parse_item_from_webpage(driver, wait))
            except Exception:
                # One failed window must not abort the remaining ones.
                self.logger.exception("民生银行---时间筛选")
            end_date = start_date

        yield from self.crawling_done(item)
    except CaptchaTimeout:
        yield from self.error_handle(username,
                                     "民生银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(username,
                                      msg="民生银行---错误信息:",
                                      tell_msg="银行流水数据爬取失败,请刷新页面重试!")
    finally:
        driver.quit()
def parse(self, response):
    """
    Log in to CCB (China Construction Bank) through the webdriver and
    download the trade records.

    Steps, in order: type the password with ``send_keys``, set the username
    with JavaScript, answer the image captcha when its input exists, submit,
    open the detail ("mingxi") window of the card, query the range
    ``today - self.date_delta`` .. ``today`` and hand over to
    ``self._download_trade_records``.

    Failures are reported through ``self.error_handle`` /
    ``self.except_handle``; the webdriver is always quit at the end.
    """
    item = response.meta["item"]
    username = item["username"]
    driver = self.load_page_by_webdriver(response.url,
                                         "//input[@id='USERID']")
    try:
        # The password control calls JavaScript encryption, so send_keys
        # must be used for it.
        password_input = driver.find_element_by_id("LOGPASS")
        password_input.send_keys(item["password"])
        fill_username_js = ('document.getElementById("LOGPASS").blur();'
                            'user_input=document.getElementById("USERID");'
                            'user_input.focus();'
                            'user_input.value="' + username + '";')
        driver.execute_script(fill_username_js)

        # Captcha: only present when the PT_CONFIRM_PWD input exists.
        try:
            driver.find_element_by_id("PT_CONFIRM_PWD")
            captcha_present = True
        except NoSuchElementException:
            captcha_present = False
        if captcha_present:
            captcha_url = driver.find_element_by_id(
                "fujiama").get_attribute("src")
            driver_cookies = get_cookies_dict_from_webdriver(driver)
            image_bytes = get_content_by_requests(captcha_url,
                                                  headers=self.headers,
                                                  cookie_jar=driver_cookies)
            captcha_code = self.ask_image_captcha(image_bytes, username)
            fill_captcha_js = (
                'document.getElementById("PT_CONFIRM_PWD").value="' +
                captcha_code + '";')
            driver.execute_script(fill_captcha_js)

        driver.execute_script(
            'document.getElementById("loginButton").click();')
        # ============================ login done ============================

        # Click the detail entry.
        iframe_xpath = "//div[@id='w3']/iframe"
        self.wait_xpath(driver, iframe_xpath)
        content_frame = driver.find_element_by_xpath(iframe_xpath)
        driver.switch_to.frame(content_frame)
        self.wait_xpath(driver, "//span[@data_id='mingxi']")

        card_list_html = find_str_range(driver.page_source,
                                        '<div class="card_list"', "/ul>")
        soup = BeautifulSoup(card_list_html, "lxml")
        mingxi_span = soup.find("span", {"data_id": "mingxi"})
        mingxi_values = mingxi_span["values"]
        driver.execute_script("""$("span[values='%s']")[0].click()""" %
                              mingxi_values)

        # The detail page opens in a second window, inside frame "sear".
        driver.switch_to.window(driver.window_handles[1])
        driver.switch_to.frame("sear")

        end_date = date.today()
        start_date_str = (end_date - self.date_delta).strftime("%Y%m%d")
        end_date_str = end_date.strftime("%Y%m%d")
        query_js = ('document.getElementById("START_DATE").value="%s";'
                    'document.getElementById("END_DATE").value="%s";'
                    'toSqOrQd("1");' % (start_date_str, end_date_str))
        driver.execute_script(query_js)
        self.wait_xpath(driver, "//iframe[@id='result']")

        item["trade_records"] = self._download_trade_records(
            driver, start_date_str, end_date_str, mingxi_values)
        yield from self.crawling_done(item)
    except CaptchaTimeout:
        yield from self.error_handle(username,
                                     "建设银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(username,
                                      "建设银行----爬取异常:",
                                      tell_msg="爬取建设银行账户流水失败")
    finally:
        driver.quit()
def parse(self, response):
    """
    Log in to CMB (China Merchants Bank) through the webdriver and request
    an access token.

    Steps, in order: type username and password through the ``Ddxoft``
    keyboard helper, submit, answer the extra image code when it is
    displayed and submit again, pass the SMS verification when its button
    exists, then yield a ``FormRequest`` to ``self.apply_token_url`` with
    the webdriver's cookies and the page's ``ClientNo``.

    Failures are reported through ``self.error_handle`` /
    ``self.except_handle``; the webdriver is always quit at the end.

    NOTE(review): the indentation of this method was lost in the reviewed
    listing. The extent of the ``with`` block and of the two key-typing
    loops is reconstructed - confirm against the original file.
    """
    meta = response.meta
    item = meta["item"]
    username = item['username']
    driver = self.load_page_by_webdriver(response.url)
    pid = driver.iedriver.process.pid
    try:
        # Pause 2 seconds before simulating keyboard input.
        sleep(2)
        with Ddxoft(pid) as dd_opt:
            for i in username:
                dd_opt.dd_keyboard(i)
            sleep(0.5)
            dd_opt.dd_tab()
            sleep(0.5)
            for i in item['password']:
                dd_opt.dd_keyboard(i)

        # Submit by running the button's onclick script
        # (butt_submit.click() is not used).
        butt_submit = driver.find_element_by_id('LoginBtn')
        butt_submit_onclick_js = butt_submit.get_attribute(
            'onclick').replace('javascript:', '')
        driver.execute_script(butt_submit_onclick_js)
        sleep(1)

        # There may be an extra code. It is generated after login was
        # clicked, so login has to be clicked again once it is filled in.
        try:
            have_extra = driver.find_element_by_id(
                'ImgExtraPwd').is_displayed()
        except WebDriverException:
            have_extra = False
        if have_extra:
            captcha_input = driver.find_element_by_id('ExtraPwd')
            captcha_url = driver.find_element_by_id(
                'ImgExtraPwd').get_attribute('src')
            capcha_cookies = get_cookies_dict_from_webdriver(driver)
            capcha_body = get_content_by_requests(
                captcha_url, headers=self.headers, cookie_jar=capcha_cookies)
            captcha_code = self.ask_image_captcha(capcha_body, username)
            captcha_input.send_keys(captcha_code)
            sleep(0.5)
            driver.execute_script(butt_submit_onclick_js)

        # We may already be logged in, in which case this element is gone.
        try:
            if driver.find_element_by_class_name(
                    'page-form-item-controls').is_displayed():
                err_message = driver.find_element_by_xpath(
                    '//label[@class="control-text error-msg"]').text
                yield from self.error_handle(username,
                                             "招商银行---登录失败:%s" %
                                             (err_message),
                                             tell_msg=err_message)
                return
        except WebDriverException:
            pass
        sleep(1)

        try:
            # Phone verification may not be requested at all: after several
            # SMS codes had been sent, the next day no SMS check was needed.
            sleep(1)
            btnSendCode = driver.find_element_by_id('btnSendCode')
            self.element_click_three_times(btnSendCode)
            sms_code = self.ask_sms_captcha(username)
            sms_code_input = driver.find_element_by_name('txtSendCode')
            sms_code_input.send_keys(sms_code)
            sleep(1)
            submit = driver.find_element_by_id('btnVerifyCode')
            self.element_click_three_times(submit)
            sleep(2)
            try:
                have_error = driver.find_element_by_class_name(
                    'control-explain').is_displayed()
            except WebDriverException:
                have_error = False
            if have_error:
                err_message = driver.find_element_by_xpath(
                    '//p[@class="control-explain"]').text
                yield from self.error_handle(username,
                                             "招商银行---登录失败:%s" %
                                             (err_message),
                                             tell_msg=err_message)
                return
        except NoSuchElementException:
            # No SMS elements on the page: nothing to verify.
            pass

        # Site-map click. JavaScript is not executed from here on; the
        # original note says executing JS would close the page directly.
        cookies = get_cookies_dict_from_webdriver(driver)
        client_no = driver.find_element_by_name('ClientNo').get_attribute(
            'value')
        data = {
            'ClientNo': client_no,
            'AuthName': '<AuthName>CBANK_DEBITCARD_ACCOUNTMANAGER</AuthName>'
        }
        meta['client_no'] = client_no
        meta["cookies"] = cookies
        yield FormRequest(url=self.apply_token_url,
                          callback=self.parse_token,
                          headers=self.headers,
                          formdata=data,
                          cookies=cookies,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        # A disabled legacy implementation used to follow here. It drove the
        # pages with webdriver clicks instead of the token request above:
        # site map -> "账户总览" to read the balance from 'lblSumOfMoney',
        # then site map -> "历史交易查询" with BeginDate set to
        # EndDate - 10000, parsing the result table seven cells per record
        # into item["trade_records"] and calling crawling_done(item).
    except CaptchaTimeout:
        yield from self.error_handle(username,
                                     "招商银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(username,
                                      "招商银行---解析失败",
                                      tell_msg="解析失败")
    finally:
        driver.quit()
def parse(self, response):
    """
    Log in to HXB (Hua Xia Bank) through the webdriver and collect the
    trade records.

    Steps, in order: type the credentials (the password goes through the
    ``Ddxoft`` keyboard helper), answer the image captcha when it is
    displayed, submit, open the account-detail query, set the start date to
    ``today - self.date_delta``, then page through the results (at most 100
    "next page" clicks) collecting rows with ``self.__get_page_detail``.

    Failures are reported through ``self.error_handle`` /
    ``self.except_handle``; the webdriver is always quit at the end.

    NOTE(review): the indentation of this method was lost in the reviewed
    listing. The extent of the ``with`` block and of the key-typing loop is
    reconstructed - confirm against the original file.
    """
    meta = response.meta
    item = meta["item"]
    user_name = item['username']
    driver = self.load_page_by_webdriver(self.login_url, '//input[@name="qy_sut"]')
    pid = driver.iedriver.process.pid  # PID of the webdriver process
    try:
        butt_submit = driver.find_element_by_name('qy_sut')
        # The username is set with JavaScript below rather than with
        # send_keys on the 'alias' input.
        with Ddxoft(pid) as dd_opt:
            driver.execute_script('username_input=document.getElementById("alias");'
                                  'username_input.value="{0}";'
                                  'username_input.focus()'.format(user_name))
            dd_opt.dd_tab()
            sleep(0.5)
            for i in item['password']:
                dd_opt.dd_keyboard(i)
            sleep(0.1)

        if driver.find_element_by_id('validatePwd').is_displayed():
            # Image captcha: download it with the browser's cookies.
            captcha_url = 'https://sbank.hxb.com.cn/easybanking/validateservlet'
            capcha_cookies = get_cookies_dict_from_webdriver(driver)
            capcha_body = get_content_by_requests(captcha_url, headers=self.headers, cookie_jar=capcha_cookies)
            captcha_code = self.ask_image_captcha(capcha_body, user_name)
            driver.execute_script('document.getElementById("verifyCode").value="{0}"'.format(captcha_code))
            sleep(0.5)
            # NOTE(review): an image src containing 'validateNoError' is
            # treated as "captcha wrong" here - confirm the naming.
            if 'validateNoError' in driver.find_element_by_id('valdtErr').get_attribute('src'):
                yield from self.error_handle(user_name,
                                             "华夏银行---登录失败:(username:%s, password:%s) %s"
                                             % (user_name, item["password"], '验证码输入错误'),
                                             tell_msg='验证码错误')
                return

        # Submit with real clicks (running the onclick script is not used).
        self.element_click_three_times(butt_submit)
        sleep(2)
        try:
            error_message = driver.find_element_by_id('mess')
            message = error_message.text
            yield from self.error_handle(user_name,
                                         "华夏银行---登录失败:(username:%s, password:%s) %s"
                                         % (user_name, item["password"], message),
                                         tell_msg=message)
            return
        except NoSuchElementException:
            # No message element: the login went through.
            pass

        # Open the 'main_nav_1' menu, then the "账户明细查询" (account
        # detail query) link; both with clicks, not onclick scripts.
        self.element_click_three_times(driver.find_element_by_class_name('main_nav_1'))
        sleep(0.5)
        self.element_click_three_times(driver.find_element_by_xpath('//a[contains(text(),"账户明细查询")]'))

        # Start date of the query, split into year / month / day fields.
        from_day = (date.today() - self.date_delta).strftime("%Y%m%d")
        driver.execute_script('document.getElementsByName("queryStrDateYear")[0].value="%s";'
                              'document.getElementsByName("queryStrDateMonth")[0].value="%s";'
                              'document.getElementsByName("queryStrDateDay")[0].value="%s";'
                              % (from_day[:4], from_day[4:6], from_day[6:8]))
        self.element_click_three_times(driver.find_element_by_id('form_submit'))
        sleep(0.5)

        page_source = driver.page_source
        result = self.__get_page_detail(page_source)
        for i in range(100):
            # Click "下一页" (next page) and collect that page's rows; any
            # failure ends the paging.
            try:
                pageDown_btn = driver.find_element_by_xpath('//a[contains(text(),"下一页")]')
                self.element_click_three_times(pageDown_btn)
                sleep(1)
                page_source = driver.page_source
                result.extend(self.__get_page_detail(page_source))
            except Exception:
                break
            # Stop once the "next page" link is disabled or gone.
            try:
                pageDown_btn = driver.find_element_by_xpath('//a[contains(text(),"下一页")]')
                if pageDown_btn.get_attribute('disabled') == 'disabled':
                    break
            except NoSuchElementException:
                break

        item['trade_records'] = result
        if 'balance' not in item and result:
            # Take the balance from the first collected record.
            item['balance'] = result[0].get('trade_balance', 0)
        yield from self.crawling_done(item)
    except CaptchaTimeout:
        yield from self.error_handle(user_name, "华夏银行---等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    except Exception:
        yield from self.except_handle(user_name, "华夏银行---解析失败", tell_msg="解析失败")
    finally:
        driver.quit()
def parse_login(self, response):
    """
    Handle the response of the login form submission.

    * 302: follow the ``Location`` header to obtain the session id
      (callback ``self.parse_getJsession``); a missing header is reported as
      wrong username or password.
    * otherwise, when the page carries a status message: report it.
    * otherwise, when the page asks for a captcha: decrement
      ``meta['captcha_retry_time']``, give up once it drops below zero, else
      answer a fresh captcha and post the login form again.
    * otherwise: report wrong username or password.
    """
    meta = response.meta
    item = meta["item"]

    if response.status == 302:
        try:
            location = response.headers.get('Location')
            if not location:
                yield from self.error_handle(item["username"],
                                             "%s 账号或密码错误" %
                                             item["username"],
                                             tell_msg='账号或密码错误')
            else:
                get_jsession_url = location.decode()
                self.logger.info("请求获取sessionid接口->%s" %
                                 get_jsession_url)
                headers = meta['headers']
                headers['Referer'] = response.url
                yield Request(headers=headers,
                              url=get_jsession_url,
                              callback=self.parse_getJsession,
                              meta=meta,
                              errback=self.err_callback,
                              dont_filter=True)
        except Exception:
            yield from self.except_handle(item["username"],
                                          "学信网---登录数据解析异常")
        return

    # Not redirected: the login did not succeed (yet).
    status_text = response.xpath('//div[@id="status"]/text()').extract_first()
    if status_text:
        yield from self.error_handle(item["username"],
                                     "%s 账号或密码错误" % item["username"],
                                     tell_msg=status_text)
        return

    captcha_required = (
        response.xpath('//input[@id="captcha"]').extract_first()
        or response.xpath('//div[@class="ct_input errors"]').extract_first())
    if not captcha_required:
        yield from self.error_handle(item["username"],
                                     "%s 账号或密码错误" % item["username"],
                                     tell_msg='账号或密码错误')
        return

    meta['captcha_retry_time'] -= 1
    if meta['captcha_retry_time'] < 0:
        yield from self.error_handle(item["username"],
                                     "%s 图片验证码请求五次,退出" %
                                     item["username"],
                                     tell_msg='验证码已刷新五次,请重试')
        return

    lt = response.xpath('//input[@name="lt"]/@value').extract_first("")
    cookiejar = get_cookiejar_from_response(response)
    captcha_url = ("https://account.chsi.com.cn/passport/captcha.image?id=" +
                   str(random()))
    headers = meta['headers']
    image_bytes = get_content_by_requests(captcha_url,
                                          headers,
                                          cookie_jar=cookiejar)
    captcha_code = self.ask_image_captcha(image_bytes,
                                          item['username'],
                                          file_type=".jpeg")
    req_data = self.get_req_data(self.user_login,
                                 user_name=item["username"],
                                 password=item["password"],
                                 lt=lt,
                                 captcha=captcha_code)
    # Refresh the Cookie header when the response set one; otherwise keep
    # the current header.
    try:
        headers['Cookie'] = response.headers.get('Set-Cookie').decode()
    except Exception:
        pass
    self.logger.debug(req_data)
    self.logger.debug(headers)
    yield FormRequest(headers=headers,
                      url=self._start_url_,
                      callback=self.parse_login,
                      formdata=req_data,
                      meta=meta,
                      errback=self.err_callback,
                      dont_filter=True)