def _get_callDetail_image_captcha(self, response, username):
    """Download the image captcha used for call-detail queries.

    The request headers derived from ``response`` are handed to
    ``self.set_image_captcha_headers_to_ssdb`` together with ``username``
    before the image itself is fetched.

    :param response: response whose cookies and headers are reused.
    :param username: value forwarded to ``set_image_captcha_headers_to_ssdb``.
    :return: whatever ``get_content_by_requests`` returns for the captcha URL.
    """
    jar = get_cookiejar_from_response(response)
    request_headers = get_headers_from_response(response)
    # The ``t`` query value comes from rand_0_1(); presumably a cache buster
    # -- confirm against the site.
    captcha_url = "".join(
        ("http://shop.10086.cn/i/authImg?t=", str(rand_0_1())))
    self.set_image_captcha_headers_to_ssdb(request_headers, username)
    return get_content_by_requests(
        captcha_url,
        headers=request_headers,
        cookie_jar=jar,
    )
def parse(self, response):
    """Build and yield the login form request for the fetched page.

    A copy of ``self.headers`` and a captcha retry budget of 5 are stored in
    ``response.meta``, and ``item['xueli']`` is reset to an empty list.
    A 302 response is delegated straight to ``parse_login``. Otherwise the
    hidden ``lt`` field is read, a captcha image is downloaded and resolved
    through ``ask_image_captcha`` when the page contains a captcha input or
    an error block, and the login form is posted to ``self._start_url_``
    with ``parse_login`` as the callback. Any exception raised on that path
    is routed to ``except_handle``.
    """
    request_headers = self.headers.copy()
    meta = response.meta
    meta['headers'] = request_headers
    meta['captcha_retry_time'] = 5
    item = meta["item"]
    item['xueli'] = []

    if response.status == 302:
        yield from self.parse_login(response)
        return

    try:
        self.logger.info("请求登录接口->%s" % self.user_login)
        lt_value = response.xpath(
            '//input[@name="lt"]/@value').extract_first("")

        # The second XPath is only evaluated when the first finds nothing.
        wants_captcha = bool(
            response.xpath('//input[@id="captcha"]').extract_first()
            or response.xpath(
                '//div[@class="ct_input errors"]').extract_first())

        answer = None
        if wants_captcha:
            meta['captcha_retry_time'] -= 1
            jar = get_cookiejar_from_response(response)
            image_url = (
                "https://account.chsi.com.cn/passport/captcha.image?id="
                + str(random()))
            # Fetch the captcha through the same proxy as the page request.
            proxy = response.meta['proxy']
            image_bytes = get_content_by_requests(
                image_url,
                request_headers,
                cookie_jar=jar,
                proxies={"https": proxy, "http": proxy},
            )
            answer = self.ask_image_captcha(
                image_bytes, item['username'], file_type=".jpeg")

        form = self.get_req_data(
            self.user_login,
            user_name=item["username"],
            password=item["password"],
            lt=lt_value,
            captcha=answer,
        )
        self.logger.debug(form)

        # The raw Set-Cookie value is replayed as the Cookie header.
        request_headers['Cookie'] = response.headers.get(
            'Set-Cookie').decode()
        request_headers['Referer'] = self._start_url_
        yield FormRequest(
            url=self._start_url_,
            headers=request_headers,
            formdata=form,
            meta=meta,
            callback=self.parse_login,
            errback=self.err_callback,
            dont_filter=True,
        )
    except Exception:
        yield from self.except_handle(meta["item"]["username"],
                                      "学信网---爬虫解析入口异常")
def _verify_callDetail_captcha(self, response, username, captcha_code):
    """Check a call-detail captcha answer against the precheck endpoint.

    :param response: response whose cookies and headers are reused.
    :param username: inserted into the URL path.
    :param captcha_code: sent as the ``captchaVal`` query value.
    :return: True when the reply body contains ``"retCode":"000000"``.
    """
    jar = get_cookiejar_from_response(response)
    request_headers = get_headers_from_response(response)

    precheck_url = "http://shop.10086.cn/i/v1/res/precheck/" + username
    precheck_url += "?captchaVal=" + captcha_code
    precheck_url += "&_=" + get_js_time()

    reply = get_content_by_requests(
        precheck_url, headers=request_headers, cookie_jar=jar)
    success_marker = b'"retCode":"000000"'
    return success_marker in reply
def verify_captcha(self, response, captcha_code):
    """Check a captcha answer without submitting the login form.

    China Mobile exposes a URL that validates the captcha on its own.

    :param response: response whose cookies and headers are reused.
    :param captcha_code: the captcha text to verify.
    :return: True when the reply body contains ``"resultCode":"0"``.
    """
    jar = get_cookiejar_from_response(response)
    request_headers = get_headers_from_response(response)

    # unicode-escape encodes the text to ASCII bytes; the literal ``\u``
    # prefixes it produces are then removed before the value is sent.
    escaped = captcha_code.encode('unicode-escape', "ignore")
    input_code = escaped.replace(b"\\u", b"")
    verify_url = b"https://login.10086.cn/verifyCaptcha?inputCode=" + input_code

    reply = get_content_by_requests(
        verify_url, headers=request_headers, cookie_jar=jar)
    success_marker = b'"resultCode":"0"'
    return success_marker in reply
def check_need_sms_captcha(self, response, username):
    """Ask the login site whether this account needs a verification code.

    :param response: response whose cookies and headers are reused.
    :param username: sent as the ``account`` query value.
    :return: True when the reply body contains ``"needVerifyCode":"1"``.
    """
    jar = get_cookiejar_from_response(response)
    request_headers = get_headers_from_response(response)

    query = {
        "accountType": "01",
        "account": username,
        "timestamp": get_js_time(),
    }
    check_url = "https://login.10086.cn/needVerifyCode.htm?" + urlencode(query)

    reply = get_content_by_requests(
        check_url, headers=request_headers, cookie_jar=jar)
    needed_marker = b'"needVerifyCode":"1"'
    return needed_marker in reply
def get_unisecid_request(self, response):
    """Call the Unicom ``genqr`` endpoint and parse its ``Set-Cookie`` header.

    The header is split on ``;``. The first segment supplies the ``name`` and
    ``value`` entries; every following non-empty segment is stored as its own
    entry keyed by the text before the first ``=``.

    Keys are kept exactly as they appear in the header (surrounding
    whitespace is NOT stripped, matching the previous behaviour). Attributes
    that carry no ``=`` (for example ``HttpOnly``) map to an empty string.

    :param response: response whose cookies and headers are reused.
    :return: dict describing the first cookie of the reply.
    :raises ValueError: if the reply has no ``Set-Cookie`` header.
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    url = "https://uac.10010.com/oauth2/genqr?" + get_js_time()
    r = get_response_by_requests(url, headers=headers, cookie_jar=cookiejar)

    cookie = r.headers.get('Set-Cookie')
    if not cookie:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError("no Set-Cookie header in response from %s" % url)

    first_pair, *attributes = cookie.split(';')
    # partition() splits on the first '=' only, so cookie values that contain
    # '=' themselves (e.g. base64 padding) no longer break the unpacking.
    name, _, value = first_pair.partition('=')
    result = {'name': name, 'value': value}

    for attribute in attributes:
        if not attribute.strip():
            # A trailing ';' leaves an empty segment behind.
            continue
        key, _, attr_value = attribute.partition('=')
        result[key] = attr_value
    return result
def parse(self, response):
    """Solve the login captcha and yield the login form request.

    The captcha image is requested with the cookies and headers of
    ``response`` (plus a ``Referer`` pointing at ``response.url``) and decoded
    by ``self.parse_capatcha``. The Struts token and ``date`` hidden fields
    are read from the page and posted together with the credentials taken
    from ``meta['item']``; ``parse_login`` receives the reply. Any exception
    is routed to ``except_handle``.
    """
    meta = response.meta
    credentials = meta["item"]
    username = credentials["username"]
    password = credentials["password"]

    token_xpath = "//input[@name='org.apache.struts.taglib.html.TOKEN']/@value"
    date_xpath = "//input[@name='date']/@value"

    try:
        jar = get_cookiejar_from_response(response)
        captcha_headers = get_headers_from_response(response)
        captcha_headers['Referer'] = response.url

        captcha_url = "https://ipcrs.pbccrc.org.cn/imgrc.do?" + get_js_time()
        image_bytes = get_content_by_requests_post(
            captcha_url, headers=captcha_headers, cookie_jar=jar)
        captcha_code = self.parse_capatcha(image_bytes)
        self.logger.info("验证码识别结果:%s" % captcha_code)

        form = {
            "org.apache.struts.taglib.html.TOKEN":
                response.xpath(token_xpath).extract_first(""),
            "method": "login",
            "date": response.xpath(date_xpath).extract_first(""),
            "loginname": username,
            "password": password,
            "_@IMGRC@_": captcha_code,
        }
        # The login POST uses the spider-level headers, not captcha_headers.
        yield FormRequest(
            "https://ipcrs.pbccrc.org.cn/login.do",
            headers=self.headers,
            formdata=form,
            meta=meta,
            callback=self.parse_login,
            dont_filter=True,
        )
    except Exception:
        yield from self.except_handle(
            username,
            msg="人行征信---登录入口解析失败",
            tell_msg="个人信息报告数据爬取失败,请刷新页面重试!",
        )
def request_sms_code(self, response, username):
    """Ask China Mobile to send the SMS verification code used for login.

    :param response: response whose cookies and headers are reused.
    :param username: sent as the ``userName`` form field.
    :return: True when the reply body is exactly ``b'0'``.
    """
    jar = get_cookiejar_from_response(response)
    request_headers = get_headers_from_response(response)

    payload = {
        "userName": username,
        "type": "01",
        "channelID": "12003",
    }
    reply = get_content_by_requests_post(
        "https://login.10086.cn/sendRandomCodeAction.action",
        data=payload,
        headers=request_headers,
        cookie_jar=jar,
    )
    return reply == b'0'
def get_need_captcha_response(self, response, username, pwd_type="02"):
    """Ask the Unicom portal whether a captcha is needed for this login.

    :param response: response whose cookies and headers are reused.
    :param username: sent as the ``userName`` query value.
    :param pwd_type: sent as the ``pwdType`` query value.
    :return: the response object returned by ``get_response_by_requests``.
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    the_time = get_js_time()

    # randint() must receive integers: float bounds such as 1E16 are rejected
    # with TypeError on Python 3.12+. Integer bounds also keep the suffix at
    # exactly 17 digits (the float ``1E17 - 1`` rounds back up to 1E17).
    callback_suffix = randint(10 ** 16, 10 ** 17 - 1)

    form_data = {
        'userName': username,
        'pwdType': pwd_type,
        '_': int(the_time) + 1,
        'callback': "jQuery1720" + str(callback_suffix) + "_" + the_time,
    }
    url = ("http://uac.10010.com/portal/Service/CheckNeedVerify?"
           + urlencode(form_data))
    return get_response_by_requests(url, headers=headers,
                                    cookie_jar=cookiejar)
def parse_login(self, response):
    """Handle the reply to the login form submission.

    * Status 302: follow the ``Location`` header with ``parse_getJsession``
      as callback; a missing ``Location`` is reported through
      ``error_handle``. Exceptions on this path go to ``except_handle``.
    * Any other status:
        - if the page carries a ``div#status`` message, report it through
          ``error_handle`` as the user-facing text;
        - else if the page shows a captcha input or error block, spend one
          unit of ``meta['captcha_retry_time']`` (giving up once it drops
          below zero), fetch and resolve a new captcha, and resubmit the
          login form with this method as callback again;
        - otherwise report a wrong account/password through ``error_handle``.

    :param response: reply to the login POST; ``response.meta`` must contain
        ``item``, ``headers`` and ``captcha_retry_time``.
    """
    meta = response.meta
    item = meta["item"]

    if response.status == 302:
        try:
            location = response.headers.get('Location')
            if location:
                get_jsession_url = location.decode()
                self.logger.info("请求获取sessionid接口->%s" % get_jsession_url)
                headers = meta['headers']
                headers['Referer'] = response.url
                yield Request(headers=headers,
                              url=get_jsession_url,
                              callback=self.parse_getJsession,
                              meta=meta,
                              errback=self.err_callback,
                              dont_filter=True)
            else:
                yield from self.error_handle(item["username"],
                                             "%s 账号或密码错误" % item["username"],
                                             tell_msg='账号或密码错误')
        except Exception:
            yield from self.except_handle(item["username"],
                                          "学信网---登录数据解析异常")
        return

    username = item["username"]

    # Evaluate the status XPath once and reuse the text as the user message.
    status_text = response.xpath('//div[@id="status"]/text()').extract_first()
    if status_text:
        yield from self.error_handle(username,
                                     "%s 账号或密码错误" % username,
                                     tell_msg=status_text)
        return

    captcha_present = (
        response.xpath('//input[@id="captcha"]').extract_first()
        or response.xpath('//div[@class="ct_input errors"]').extract_first())
    if not captcha_present:
        yield from self.error_handle(username,
                                     "%s 账号或密码错误" % username,
                                     tell_msg='账号或密码错误')
        return

    meta['captcha_retry_time'] -= 1
    if meta['captcha_retry_time'] < 0:
        yield from self.error_handle(username,
                                     "%s 图片验证码请求五次,退出" % username,
                                     tell_msg='验证码已刷新五次,请重试')
        return

    lt = response.xpath('//input[@name="lt"]/@value').extract_first("")
    cookiejar = get_cookiejar_from_response(response)
    url = "https://account.chsi.com.cn/passport/captcha.image?id=" + str(
        random())
    headers = meta['headers']

    # Download the captcha through the same proxy as the login request when
    # one is configured, so the image is requested from the same address as
    # the session; without a proxy the call is unchanged.
    fetch_kwargs = {"cookie_jar": cookiejar}
    proxy = meta.get('proxy')
    if proxy:
        fetch_kwargs["proxies"] = {"https": proxy, "http": proxy}
    captcha_body = get_content_by_requests(url, headers, **fetch_kwargs)

    captcha_code = self.ask_image_captcha(captcha_body, username,
                                          file_type=".jpeg")
    req_data = self.get_req_data(self.user_login,
                                 user_name=username,
                                 password=item["password"],
                                 lt=lt,
                                 captcha=captcha_code)

    # Refreshing the Cookie header is best effort: a reply without
    # Set-Cookie simply keeps whatever Cookie header is already stored.
    set_cookie = response.headers.get('Set-Cookie')
    if set_cookie:
        try:
            headers['Cookie'] = set_cookie.decode()
        except UnicodeDecodeError:
            self.logger.warning(
                "Set-Cookie header could not be decoded; "
                "keeping the previous Cookie header")

    self.logger.debug(req_data)
    self.logger.debug(headers)
    yield FormRequest(headers=headers,
                      url=self._start_url_,
                      callback=self.parse_login,
                      formdata=req_data,
                      meta=meta,
                      errback=self.err_callback,
                      dont_filter=True)